scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20796B)
      1 {
      2   "paper": {
      3     "title": "ChatGPT Agent System Card",
      4     "authors": ["OpenAI"],
      5     "year": 2025,
      6     "venue": "OpenAI Technical Report"
      7   },
      8   "checklist": {
      9     "artifacts": {
     10       "code_released": {
     11         "applies": true,
     12         "answer": false,
     13         "justification": "No source code or repository is released. The paper describes internal evaluation infrastructure but provides no public code."
     14       },
     15       "data_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No evaluation datasets are released. The paper references internal benchmarks, production benchmarks, and proprietary evaluation sets without providing download links."
     19       },
     20       "environment_specified": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No environment specifications, dependency lists, or setup instructions are provided."
     24       },
     25       "reproduction_instructions": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No reproduction instructions are included. Most evaluations use internal infrastructure and proprietary data that cannot be replicated externally."
     29       }
     30     },
     31     "statistical_methodology": {
     32       "confidence_intervals_or_error_bars": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Section 5.1.3.2 states: 'We calculate 95% confidence intervals for pass@1 using the standard bootstrap procedure.' Confidence intervals are reported for SWE-bench results."
     36       },
     37       "significance_tests": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper makes many comparative claims (e.g., 'ChatGPT agent generally outperforms o3') but no statistical significance tests are reported for these comparisons."
     41       },
     42       "effect_sizes_reported": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Results are reported with absolute scores and baselines (e.g., specific percentages for ChatGPT agent vs o3 on multiple benchmarks), providing context for the magnitude of differences."
     46       },
     47       "sample_size_justified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Sample sizes are stated (e.g., n=477 for SWE-bench Verified, 97 multiple-choice questions) but no justification or power analysis is provided for why these sizes are adequate. One exception: Section 5.2.2.5 mentions 'we explicitly considered the statistical power reflected in our study design' for the novice uplift study, but no power analysis details are given."
     51       },
     52       "variance_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Most results are reported as single point estimates. The SWE-bench section mentions averaging over 4 tries per instance but does not report standard deviation or variance across runs for most benchmarks."
     56       }
     57     },
     58     "evaluation_design": {
     59       "baselines_included": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "OpenAI o3, o4-mini, and GPT-4o are consistently used as baselines across nearly all evaluations."
     63       },
     64       "baselines_contemporary": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "All baselines are OpenAI's own models (o3, o4-mini, GPT-4o). No external baselines from competing systems (Claude, Gemini, Llama) are included, making it impossible to assess performance relative to the broader landscape."
     68       },
     69       "ablation_study": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No ablation study isolates the contribution of individual components (deep research, Operator browsing, terminal tool, connectors). The system is evaluated as a whole."
     73       },
     74       "multiple_metrics": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Multiple metrics are used across different evaluation domains: not_unsafe scores, accuracy, pass@1, SimpleQA accuracy, hallucination rates, BBQ bias scores, and various safety-specific metrics."
     78       },
     79       "human_evaluation": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Human evaluation is used in several places: human red-teamers for biosecurity (Section 5.2.2.5), human graders for biorisk questions (Section 5.2.1.1), and the novice uplift study with 59 human participants."
     83       },
     84       "held_out_test_set": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No explicit discussion of held-out test sets vs development sets. The paper does not clarify whether reported results are on data used for any tuning or selection decisions."
     88       },
     89       "per_category_breakdown": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Results are broken down by category extensively: disallowed content by category (hate, sexual, etc.), biosecurity by threat creation stage, CTF by difficulty tier, coding by benchmark type."
     93       },
     94       "failure_cases_discussed": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Several failure modes are discussed: SimpleQA accuracy lower than o3 due to search behavior (Section 2.3), agent failing to recover from tool failures in biosecurity tasks (Section 5.2.1.3), and specific CTF failure patterns."
     98       },
     99       "negative_results_reported": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Negative results are reported: ChatGPT agent scores lower than o3 on SimpleQA accuracy, shows a drop vs o3 on some biosecurity hard questions with safety mitigations, and 'no model is able to solve the scenario unaided' in certain cyber evaluations."
    103       }
    104     },
    105     "claims_and_evidence": {
    106       "abstract_claims_supported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The introduction's claims about combining deep research, Operator, and terminal capabilities with safety protections are supported by the detailed evaluation sections that follow."
    110       },
    111       "causal_claims_justified": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "The paper makes causal claims about safety mitigations (e.g., mitigations reduce harmful outputs) but does not provide controlled ablation of specific mitigations to support these claims rigorously. Results are shown with and without mitigations in some cases but the study design is not clearly controlled."
    115       },
    116       "generalization_bounded": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper claims the system has 'our strongest suite yet of end-to-end safety protections' but evaluations cover limited scenarios. The generalization from specific benchmarks to real-world safety is not explicitly bounded."
    120       },
    121       "alternative_explanations_discussed": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "No substantive discussion of alternative explanations for the results. For example, when ChatGPT agent outperforms o3 on some biosecurity tasks, the paper does not discuss whether this is due to browsing capability, model improvements, or other confounds."
    125       }
    126     },
    127     "setup_transparency": {
    128       "model_versions_specified": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper refers to 'ChatGPT agent', 'OpenAI o3', 'o4-mini', and 'GPT-4o' without specific version identifiers, snapshot dates, or API version strings."
    132       },
    133       "prompts_provided": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "No actual prompts or system instructions used in evaluations are provided. Evaluation methodology is described in natural language only."
    137       },
    138       "hyperparameters_reported": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any of the evaluations."
    142       },
    143       "scaffolding_described": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The agentic scaffolding is described at a high level: browsing via Operator, terminal tool, connectors, deep research capability. Section 5.1.3.2 mentions 'an internal tool scaffold designed for efficient iterative file editing and debugging' for o3/o4-mini. The system's multi-layered safety stack is described in detail."
    147       },
    148       "data_preprocessing_documented": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Data preprocessing for evaluation datasets is not documented. For example, how production benchmark conversations were selected and processed is not described."
    152       }
    153     },
    154     "limitations_and_scope": {
    155       "limitations_section_present": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 5.3 ('Remaining Risks') serves as a limitations section, discussing residual risks including unknown jailbreaks, incremental information leaking, and trajectory visibility."
    159       },
    160       "threats_to_validity_specific": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Specific threats are discussed: 'there is a risk of previously unknown universal jailbreaks' (Section 5.3), limitations of confidence intervals for SWE-bench (sampling variance vs problem-level variance), and limitations of the novice uplift study design."
    164       },
    165       "scope_boundaries_stated": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper does not explicitly state what the evaluations do NOT show. There is no equivalent of 'what the evidence does not demonstrate' — the Remaining Risks section discusses potential failures but does not bound the scope of safety claims."
    169       }
    170     },
    171     "data_integrity": {
    172       "raw_data_available": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No raw evaluation data is available for independent verification. All results are reported in aggregate."
    176       },
    177       "data_collection_described": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Data collection is described for several evaluations: SWE-bench Verified uses a validated subset of n=477; production benchmarks are described as representative of production data; and the biorisk rubric was developed collaboratively with external biosecurity organizations."
    181       },
    182       "recruitment_methods_described": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "For the novice uplift study (Section 5.2.2.5), recruitment is described: 59 participants across two rounds, with 179 total submissions, though selection criteria for participants are not detailed. Red team participants from external biosecurity organizations (Gryphon Scientific, SecureBio) are named."
    186       },
    187       "data_pipeline_documented": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The full pipeline from raw evaluation outputs to reported metrics is not documented. Processing steps, filtering criteria, and any exclusions are not described."
    191       }
    192     },
    193     "conflicts_of_interest": {
    194       "funding_disclosed": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No funding disclosure or acknowledgments section is present in the paper."
    198       },
    199       "affiliations_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "The paper is authored by 'OpenAI' and this is clearly stated. The conflict — OpenAI evaluating its own product — is self-evident from the authorship."
    203       },
    204       "funder_independent_of_outcome": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "OpenAI is evaluating its own commercial product (ChatGPT Agent). The funder/producer has a direct financial interest in the outcome of these safety evaluations."
    208       },
    209       "financial_interests_declared": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No competing interests or financial interests statement is present in the paper."
    213       }
    214     },
    215     "contamination": {
    216       "training_cutoff_stated": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No training data cutoff date is stated for ChatGPT agent or the comparison models."
    220       },
    221       "train_test_overlap_discussed": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No discussion of whether benchmark data (SWE-bench, SimpleQA, BBQ, etc.) appeared in the model's training data."
    225       },
    226       "benchmark_contamination_addressed": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "Several benchmarks used (SWE-bench, BBQ, SimpleQA) are publicly available and may have been in training data. This is not discussed."
    230       }
    231     },
    232     "human_studies": {
    233       "pre_registered": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "The novice uplift study (Section 5.2.2.5) involves 59 human participants but no pre-registration is mentioned."
    237       },
    238       "irb_or_ethics_approval": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No IRB or ethics board approval is mentioned for the novice uplift study involving human participants."
    242       },
    243       "demographics_reported": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "No demographic information is provided about the 59 participants in the novice uplift study beyond being described as 'novices'."
    247       },
    248       "inclusion_exclusion_criteria": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No inclusion or exclusion criteria are described for participant selection in the novice uplift study."
    252       },
    253       "randomization_described": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "The novice uplift study is not a randomized controlled experiment comparing treatment vs control; it tests whether participants can achieve harmful outcomes with the model."
    257       },
    258       "blinding_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "Blinding is not feasible for a study where participants knowingly interact with a specific AI system to test its safety."
    262       },
    263       "attrition_reported": {
    264         "applies": true,
    265         "answer": false,
    266         "justification": "The 59 participants made 179 submissions, but no attrition data are reported — it is unclear whether any participants dropped out or were excluded."
    267       }
    268     },
    269     "cost_and_practicality": {
    270       "inference_cost_reported": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "No inference costs, token usage, or latency figures are reported for any of the evaluations despite extensive use of multiple models across many benchmarks."
    274       },
    275       "compute_budget_stated": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "No total computational budget, GPU hours, or API costs are stated for the evaluation campaigns."
    279       }
    280     }
    281   },
    282   "claims": [
    283     {
    284       "claim": "ChatGPT agent performs at parity with OpenAI o3 on the standard disallowed content evaluation and generally outperforms o3 on the more challenging production benchmark set.",
    285       "evidence": "Tables 1 and 2 in Section 2.1 show detailed per-category not_unsafe scores comparing ChatGPT agent to o3.",
    286       "supported": "moderate"
    287     },
    288     {
    289       "claim": "ChatGPT agent scores lower on SimpleQA accuracy than o3 but has a lower hallucination rate.",
    290       "evidence": "Section 2.3 reports accuracy and hallucination scores, with manual investigation attributing the gap to search behavior and grading rubric edge cases.",
    291       "supported": "moderate"
    292     },
    293     {
    294       "claim": "ChatGPT agent significantly outperformed o3 on hard biosecurity questions that o3 typically answers incorrectly.",
    295       "evidence": "Section 5.2.1.3 reports SecureBio evaluation: agent answered 4 hard questions correctly vs o3's average of ~0 across 10 runs. However, with safety mitigations the agent shows a performance drop.",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "The model does not provide meaningful uplift to novices attempting to create biological threats.",
    300       "evidence": "Section 5.2.2.5 describes a novice uplift study with 59 participants across two rounds. Of 179 submissions, 16 were flagged as concerning but none were assessed to demonstrate significant uplift.",
    301       "supported": "weak"
    302     },
    303     {
    304       "claim": "ChatGPT agent performs comparably to o3 and o4-mini across CTF challenge tiers.",
    305       "evidence": "Section 5.1.2 reports CTF results with 12 attempts per task showing comparable performance across difficulty tiers.",
    306       "supported": "moderate"
    307     }
    308   ],
    309   "methodology_tags": ["benchmark-eval", "case-study"],
    310   "key_findings": "This is OpenAI's system card for ChatGPT Agent, a unified agentic system combining deep research, web browsing (Operator), and terminal access. Safety evaluations show the model performs at parity with, or slightly better than, o3 on most safety benchmarks while demonstrating enhanced capabilities on challenging tasks. OpenAI chose a precautionary 'High' classification under their Preparedness Framework for biological risks, despite not having definitive evidence of meaningful novice uplift. The paper describes a multi-layered safety stack including safety training, automated monitors, user confirmations, and a rapid remediation protocol.",
    311   "red_flags": [
    312     {
    313       "flag": "Company evaluating own product",
    314       "detail": "OpenAI is the sole evaluator of its own commercial product. While some external organizations (Gryphon Scientific, SecureBio) contributed to biosecurity evaluations, the vast majority of evaluations are entirely internal with no independent verification."
    315     },
    316     {
    317       "flag": "Only internal baselines",
    318       "detail": "All comparison baselines are OpenAI's own models (o3, o4-mini, GPT-4o). No external competing models are included, making it impossible to contextualize performance within the broader landscape."
    319     },
    320     {
    321       "flag": "No raw data or reproduction path",
    322       "detail": "No evaluation code, datasets, prompts, or raw data are released. Results cannot be independently verified or reproduced."
    323     },
    324     {
    325       "flag": "Contamination risk unaddressed",
    326       "detail": "Multiple public benchmarks are used (SWE-bench, BBQ, SimpleQA) without any discussion of whether the model was trained on this data."
    327     },
    328     {
    329       "flag": "Weak novice uplift study design",
    330       "detail": "The key safety claim — no meaningful novice uplift for bioweapons — rests on a study with 59 participants and no reported pre-registration, IRB approval, participant demographics, or inclusion criteria. The paper acknowledges limitations of the safety system but still concludes that no submission demonstrated significant uplift, even while adopting a precautionary 'High' classification for biological risk."
    331     }
    332   ],
    333   "cited_papers": [
    334     {
    335       "title": "A StrongREJECT for Empty Jailbreaks",
    336       "authors": ["A. Souly", "Q. Lu", "D. Bowen"],
    337       "year": 2024,
    338       "arxiv_id": "2402.10260",
    339       "relevance": "Benchmark for evaluating jailbreak resistance in LLMs, directly relevant to AI safety evaluation methodology."
    340     },
    341     {
    342       "title": "BBQ: A hand-built bias benchmark for question answering",
    343       "authors": ["A. Parrish", "A. Chen", "N. Nangia"],
    344       "year": 2021,
    345       "arxiv_id": "2110.08193",
    346       "relevance": "Bias evaluation benchmark used to assess fairness in LLM outputs."
    347     },
    348     {
    349       "title": "Building an early warning system for LLM-aided biological threat creation",
    350       "authors": ["T. Patwardhan", "K. Liu", "T. Markov"],
    351       "year": 2023,
    352       "relevance": "Foundational work on evaluating biological risks from LLMs, directly relevant to AI safety methodology."
    353     },
    354     {
    355       "title": "Lab-Bench: Measuring capabilities of language models for biology research",
    356       "authors": ["J. M. Laurent", "J. D. Janizek"],
    357       "year": 2024,
    358       "relevance": "Benchmark for evaluating LLM capabilities in biology, relevant to dual-use risk assessment."
    359     },
    360     {
    361       "title": "Introducing SWE-bench Verified",
    362       "authors": ["N. Chowdhury", "J. Aung", "C. J. Shern"],
    363       "year": 2024,
    364       "relevance": "Key benchmark for evaluating AI coding agents on real-world software engineering tasks."
    365     },
    366     {
    367       "title": "PaperBench: Evaluating AI's ability to replicate AI research",
    368       "authors": ["G. Starace", "O. Jaffe", "D. Sherburn"],
    369       "year": 2025,
    370       "relevance": "Benchmark evaluating AI agents' ability to replicate research papers, relevant to AI capability assessment."
    371     }
    372   ]
    373 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs