ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25802B)


      1 {
      2   "paper": {
      3     "title": "Measuring Mid-2025 LLM-Assistance on Novice Performance in Biology",
      4     "authors": ["Shen Zhou Hong", "Alex Kleinman", "Alyssa Mathiowetz", "Adam Howes", "Julian Cohen", "Suveer Ganta", "Alex Letizia", "Dora Liao", "Deepika Pahari", "Xavier Roberts-Gaal", "Luca Righetti", "Joe Torres"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.16703"
      8   },
      9   "scan_version": 2,
     10   "active_modules": [],
     11   "checklist": {
     12     "artifacts": {
     13       "code_released": {
     14         "applies": true,
     15         "answer": true,
     16         "justification": "Analysis code is publicly available at https://github.com/panoplia/PAN-2025-001-code with a README and environment specification (Section 4.13). Randomization code also released at https://github.com/panoplia/randomization."
     17       },
     18       "data_released": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "The pre-registration appendix and statistical analysis plan are publicly accessible, but the paper does not indicate that the raw participant-level outcome data or LLM chat logs are publicly released."
     22       },
     23       "environment_specified": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Section 4.13 states the code repository contains 'a README and environment specification, as well as instructions for reproducing the study's analyses.'"
     27       },
     28       "reproduction_instructions": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Section 4.13 states the repository contains 'instructions for reproducing the study's analyses.' R version 4.5.1 is specified (Section 4.10)."
     32       }
     33     },
     34     "statistical_methodology": {
     35       "confidence_intervals_or_error_bars": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "95% confidence intervals are reported for all primary and secondary outcomes (Table 8), Kaplan-Meier curves (Figure 4), RMST analyses (Table 10), and 95% credible intervals for Bayesian models (Figure 3)."
     39       },
     40       "significance_tests": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "One-sided Fisher's exact tests used for primary and secondary outcomes (Table 8). Log-rank tests for time-to-completion (Table 11). Likelihood ratio tests for subgroup analyses (Table 15)."
     44       },
     45       "effect_sizes_reported": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Risk ratios, risk differences, and odds ratios reported throughout (Table 8, Figure 3, Figure 5b). Cohen's d reported for balance tests (Tables 4-7). Bayesian posterior risk ratios with credible intervals provided."
     49       },
     50       "sample_size_justified": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 4.10 describes a pre-study power analysis with assumed success rates, correlation structure, target power (90%), and resulting sample size (~75 per arm, n=150). Post-hoc power calculations also provided in Discussion."
     54       },
     55       "variance_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Standard deviations reported for demographics and survey measures (Tables 4-7, 17). Credible intervals from posterior distributions capture uncertainty across Bayesian models. IQR reported for multiple measures."
     59       }
     60     },
     61     "evaluation_design": {
     62       "baselines_included": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The Internet arm serves as the control/baseline condition against which the LLM arm is compared. This is the core RCT design."
     66       },
     67       "baselines_contemporary": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The control condition (internet access without LLMs) is a meaningful contemporary baseline for assessing LLM uplift. The LLM arm included mid-2025 frontier models (Opus 4, GPT-5, Gemini 2.5)."
     71       },
     72       "ablation_study": {
     73         "applies": false,
     74         "answer": false,
     75         "justification": "This is an RCT with a single treatment factor (LLM access vs. internet only). There is no multi-component system to ablate."
     76       },
     77       "multiple_metrics": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Multiple outcome measures: binary task success, milestone achievement, procedural step progression (ordinal), time-to-completion (RMST), number of attempts, and subgroup analyses. Both frequentist and Bayesian analyses."
     81       },
     82       "human_evaluation": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Task outcomes were assessed by study staff using objective laboratory measurements (cell counts, sequencing, qPCR) with investigator blinding. Samples were mixed across arms and batched before evaluation (Section 4.3)."
     86       },
     87       "held_out_test_set": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "This is an RCT, not a benchmark evaluation. The concept of held-out test sets does not apply."
     91       },
     92       "per_category_breakdown": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Results broken down by individual task (5 tasks), by procedural step within each task (Figure 9, Table 16), by FAS vs PPS populations, and by subgroup (Table 15)."
     96       },
     97       "failure_cases_discussed": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Discussion addresses specific failure modes: LLMs generated incorrect DNA sequences and reagent recommendations for molecular cloning (Section 2.6). Figure 11 shows LLM participants submitted first requests faster but correct requests at similar rates."
    101       },
    102       "negative_results_reported": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The primary outcome was null (no significant difference, P=0.759). The paper prominently reports this negative finding and that LLM usage intensity did not predict success (Table 15). LLM users' confidence in LLM helpfulness decreased over time."
    106       }
    107     },
    108     "claims_and_evidence": {
    109       "abstract_claims_supported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Abstract claims are well-hedged and match results: 'no significant difference in the primary endpoint' (Table 8, P=0.759), 'numerically higher success rates in four of five tasks' (Table 8), '1.4-fold increase' (Figure 3), 'posterior probability of a positive effect: 81%–96%' (Figure 5b)."
    113       },
    114       "causal_claims_justified": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The RCT design with randomization (Section 4.3, NIST Randomness Beacon), investigator blinding, balanced baseline characteristics (Tables 3-7), and pre-registration supports causal inference. The paper also explicitly acknowledges design confounds limiting causal inference for individual tasks (Discussion)."
    118       },
    119       "generalization_bounded": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Discussion extensively bounds generalization: results are for 'mid-2025 LLMs', novice users only, simplified/decoupled tasks, and may not extend to expert users or complete workflows. The paper explicitly states results should not be used as 'worst-case frontier risk' estimates."
    123       },
    124       "alternative_explanations_discussed": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Discussion presents multiple alternative explanations: benchmark-reality gap, elicitation failure, expertise-dependent performance gap, and task-dependent LLM utility. Also discusses design confounds (parallel task attempts, prerequisite gating, staggered release)."
    128       },
    129       "proxy_outcome_distinction": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper carefully frames task completion as modeling a reverse genetics workflow rather than claiming to directly measure biosecurity risk. Discussion explicitly states results cannot be extrapolated to 'independently establish a functional laboratory or execute a complete reverse genetics project.'"
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "Section 4.4.2 lists model families (Opus 4, Sonnet 4, o3, GPT-4o, GPT-5, Gemini 2.5) but does not provide specific API versions or snapshot dates. Marketing names without version identifiers are used throughout."
    140       },
    141       "prompts_provided": {
    142         "applies": false,
    143         "answer": false,
    144         "justification": "Participants freely prompted LLMs in an unstructured manner — the study measured naturalistic LLM use, not researcher-designed prompts. There are no researcher-specified prompts to disclose."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": false,
    148         "answer": false,
    149         "justification": "Participants used LLMs via standard consumer interfaces (ChatGPT, Claude, Gemini), not API calls with researcher-controlled parameters. Hyperparameters were not under experimental control."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding was used. Participants interacted directly with LLM chatbots via standard consumer interfaces."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Data collection procedures described in detail (Section 4.9): REDCap for outcomes, Veriato and Webtime Tracker for usage data, Plasmidsaurus for sequencing. CONSORT diagram (Figure 2b) shows participant flow. FAS vs PPS criteria defined (Section 4.5)."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Discussion contains an extensive limitations paragraph covering experimental design limitations, external validity challenges, and statistical power limitations."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Specific threats discussed: decoupled tasks vs. integrated workflows, simplified molecular cloning, parallel task attempts creating confounds, underpowered primary outcome (observed 36% power for OR=2.0), evolving model capabilities limiting external validity, and design confounds for individual task causal inference."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Paper explicitly states: results 'may not be extrapolated to conclude that participants could independently establish a functional laboratory,' should not be used as 'worst-case frontier risk' estimates, and that novice elicitation ability may evolve. Also notes exclusion of material acquisition and infrastructure setup."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "The pre-registration appendix and analysis code are released, but participant-level outcome data, LLM chat logs, and laboratory measurements do not appear to be publicly available."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 4.9 describes data collection via REDCap, Google Drive, Veriato monitoring, Webtime Tracker, and Plasmidsaurus sequencing. Sections 4.6.1-4.6.5 detail specific laboratory instruments and outcome measures."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Section 4.2 describes recruitment from the greater Boston area through 'advertisements at local universities, on social media, on recruitment platforms, and at community events' with specific eligibility criteria (age ≥18, English proficiency, ≤2 weeks prior lab experience)."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "CONSORT diagram (Figure 2b) shows full participant flow: 153 randomized → 128 PPS (25 excluded for attendance). Outcome assessment procedures described with blinding protocol. Analysis code publicly available. SAP registered prior to unblinding."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Section 5 states: 'This work was supported by grants from the Frontier Model Forum, Sentinel Bio, and the David and Lucile Packard Foundation.'"
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations listed: Active Site, Independent, and Model Evaluation and Threat Research, Inc. Acknowledgments list advisory board members from Anthropic, OpenAI, Google DeepMind, Microsoft, Meta, Amazon — all disclosed."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "The Frontier Model Forum is an industry consortium of frontier AI companies (Google, OpenAI, Anthropic, Microsoft) that has a stake in biosecurity risk assessments of LLMs. Section 4.1 notes 'Two funders (Frontier Model Forum and Sentinel Bio) had one representative each on the advisory board' and 'Funders were consulted on aspects of study design.'"
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Section 7 states: 'The authors do not declare any competing interests.'"
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "This RCT does not evaluate a pre-trained model's capability on a benchmark. It measures human performance with LLM assistance in physical laboratory tasks."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": false,
    231         "answer": false,
    232         "justification": "No benchmark evaluation of model knowledge. The study measures human task performance, not model accuracy on a test set."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No benchmark evaluation. The study is an RCT measuring human laboratory performance."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "Pre-registered at AsPredicted.org (#235922) prior to participant activities. Statistical analysis plan separately registered (#249463) after study start but prior to unblinding (Section 4.1)."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": true,
    248         "answer": true,
    249         "justification": "Section 4.1: 'The protocol and all amendments were approved by the Advarra Institutional Review Board (Pro00085300).'"
    250       },
    251       "demographics_reported": {
    252         "applies": true,
    253         "answer": true,
    254         "justification": "Extensive demographics in Tables 4-7 and Figure 2a: age, sex, race/ethnicity, education level, field of study, prior biology experience (with subscores), LLM experience, nonverbal reasoning (Raven's), self-efficacy, and perfectionism measures."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": true,
    258         "answer": true,
    259         "justification": "Section 4.2 specifies: age ≥18, available for study duration, sufficient English, ≤2 weeks prior hands-on lab experience in relevant areas. Exclusion: inability to complete safety training, refusal of randomization, or refusal to follow safety/study rules."
    260       },
    261       "randomization_described": {
    262         "applies": true,
    263         "answer": true,
    264         "justification": "Section 4.3: stratified simple randomization by session time (morning/afternoon), 1:1 allocation, conducted by independent statistician using R program with NIST Randomness Beacon, seed timestamps pre-registered, allocation uploaded to REDCap with concealment."
    265       },
    266       "blinding_described": {
    267         "applies": true,
    268         "answer": true,
    269         "justification": "Section 4.3 details investigator blinding: participants assigned non-sequential IDs and alliterative pseudonyms, data collection blinded except two unblinded sample couriers, samples mixed across arms and batched before evaluation, SAP developed before unblinding. Participants could not be blinded to their own condition."
    270       },
    271       "attrition_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "CONSORT diagram (Figure 2b): 153 randomized, 128 met PPS criteria, 25 did not (12 Internet, 13 LLM). Both FAS (intent-to-treat) and PPS analyses reported. Exit survey adherence data in Table 19."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "Token usage per participant is reported (Figure 6, average 312,696 tokens), but no monetary costs for LLM API access or total study operational costs are stated."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "No total computational budget, API spend, or hardware costs stated for the LLM usage or data analysis. Study operational costs (laboratory, equipment, compensation) are not reported."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "LLM access did not significantly increase completion of the core reverse genetics sequence (primary outcome).",
    293       "evidence": "5.2% LLM vs. 6.6% Internet (RR 0.79, 95% CI 0.24–2.62; P=0.759, Fisher's exact test). Table 8, Figure 3.",
    294       "supported": "strong"
    295     },
    296     {
    297       "claim": "LLM access was associated with higher cell culture success in the per-protocol population.",
    298       "evidence": "PPS: 79.7% LLM vs. 62.5% Internet (RR 1.28, P=0.025). FAS: 68.8% vs. 55.3% (P=0.059). Table 8.",
    299       "supported": "moderate"
    300     },
    301     {
    302       "claim": "Post-hoc Bayesian modeling estimates an approximate 1.4-fold increase in success for a typical reverse genetics task under LLM assistance.",
    303       "evidence": "Out-of-sample RR 1.42 (95% CrI 0.74–2.62, Pr(RR>1)=85.5%). Hierarchical participant-level model selected via LOO-CV. Figure 3, Table 9.",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "LLM participants progressed further through procedural steps across all tasks.",
    308       "evidence": "Ordinal regression: posterior probability of positive effect 80.9%–96.4% across tasks. LLM arm higher completion at 21 of 22 monitored procedural steps. Figure 5, Figure 9.",
    309       "supported": "moderate"
    310     },
    311     {
    312       "claim": "LLM arm achieved cell culture success approximately 6 days earlier.",
    313       "evidence": "RMST difference: −6.02 days (95% CI −11.05 to −0.98; P=0.02). Table 10, Figure 4b.",
    314       "supported": "strong"
    315     },
    316     {
    317       "claim": "No subgroup showed substantially different treatment effects for cell culture.",
    318       "evidence": "Likelihood ratio tests for interaction: nonverbal reasoning P=0.90, prior biology P=0.27, prior LLM experience P=0.12, YouTube P=0.38. LLM usage intensity and image uploads did not predict success. Table 15.",
    319       "supported": "moderate"
    320     },
    321     {
    322       "claim": "There is a gap between in silico benchmark performance and real-world utility for novices.",
    323       "evidence": "LLMs outperform experts on VCT and LAB-Bench benchmarks (cited), but did not substantially increase novice completion of complex lab procedures in this RCT. LLM users' confidence in LLM helpfulness declined over the study (Figure 10F). YouTube rated more helpful than any single LLM.",
    324       "supported": "moderate"
    325     }
    326   ],
    327   "methodology_tags": ["rct"],
    328   "key_findings": "In the largest public RCT (n=153) evaluating LLM uplift for novice biological laboratory performance, access to mid-2025 frontier LLMs did not significantly improve completion of a model viral reverse genetics workflow (5.2% vs 6.6%, P=0.759). However, LLM access was associated with higher cell culture success in the per-protocol population (79.7% vs 62.5%, P=0.025) and with greater progression through procedural steps across all tasks. Post-hoc Bayesian modeling estimated a modest ~1.4-fold average uplift, with a credible interval that includes no effect (95% CrI 0.74–2.62). Taken together, the results point to a substantial gap between LLM benchmark performance and real-world laboratory utility for novice users.",
    329   "red_flags": [
    330     {
    331       "flag": "Substantially underpowered primary outcome",
    332       "detail": "The study had only 36% power to detect OR=2.0 for the primary outcome due to much lower completion rates than anticipated (5-7% observed vs. 19-40% assumed). The authors acknowledge this but the null finding for the primary outcome is difficult to interpret given the low power."
    333     },
    334     {
    335       "flag": "Non-independent funders",
    336       "detail": "Frontier Model Forum (consortium of OpenAI, Google, Anthropic, Microsoft) funded the study and had advisory board representation. These companies have a potential interest in results showing LLMs pose limited biosecurity risk. However, the funders 'did not hold authority over design or implementation' (Section 4.1)."
    337     },
    338     {
    339       "flag": "Primary analysis test changed after pre-registration",
    340       "detail": "The pre-registration specified a one-sided two-proportion z-test, but the SAP (finalized pre-unblinding) switched to Fisher's exact test due to low outcome counts. While justified statistically and registered pre-unblinding, this represents a deviation from the original pre-registration."
    341     },
    342     {
    343       "flag": "Simplified task design limits ecological validity",
    344       "detail": "Tasks were decoupled from an end-to-end workflow, material acquisition and infrastructure setup were excluded, and molecular cloning was simplified. The paper acknowledges that results from these simplified tasks cannot be extrapolated to full reverse genetics capability."
    345     }
    346   ],
    347   "cited_papers": [
    348     {
    349       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    350       "authors": ["Joel Becker"],
    351       "year": 2025,
    352       "relevance": "METR RCT on LLM impact on developer productivity — key comparator RCT in the AI productivity space."
    353     },
    354     {
    355       "title": "Can Large Language Models Democratize Access to Dual-Use Biotechnology?",
    356       "authors": ["Emily H. Soice"],
    357       "year": 2023,
    358       "arxiv_id": "2306.03809",
    359       "relevance": "Early study on LLMs and biosecurity information access, predecessor to the current RCT."
    360     },
    361     {
    362       "title": "The Operational Risks of AI in Large-Scale Biological Attacks: Results of a Red-Team Study",
    363       "authors": ["Christopher A. Mouton", "Caleb Lucas", "Ella Guest"],
    364       "year": 2024,
    365       "doi": "10.7249/RRA2977-2",
    366       "relevance": "RAND red-team study on LLM-aided bioweapon risks, text-based evaluation this RCT extends to physical lab."
    367     },
    368     {
    369       "title": "Building an Early Warning System for LLM-aided Biological Threat Creation",
    370       "authors": ["Tejal Patwardhan"],
    371       "year": 2024,
    372       "relevance": "OpenAI study on LLM biological threat potential, predecessor human study in biosecurity evaluation."
    373     },
    374     {
    375       "title": "Measuring Skill-Based Uplift from AI in a Real Biological Laboratory",
    376       "authors": ["Ethan Obie Romero-Severson"],
    377       "year": 2025,
    378       "arxiv_id": "2512.10960",
    379       "relevance": "Near-concurrent pilot-scale study on LLM uplift in laboratory tasks (posted to arXiv about two months earlier); a smaller-scale counterpart to this work."
    380     },
    381     {
    382       "title": "Virology Capabilities Test (VCT): A Multimodal Virology Q&A Benchmark",
    383       "authors": ["Jasper Götting"],
    384       "year": 2025,
    385       "arxiv_id": "2504.16137",
    386       "relevance": "Key in-silico benchmark whose ecological validity this RCT directly challenges."
    387     },
    388     {
    389       "title": "LAB-Bench: Measuring Capabilities of Language Models for Biology Research",
    390       "authors": ["Jon M. Laurent"],
    391       "year": 2024,
    392       "arxiv_id": "2407.10362",
    393       "relevance": "Biology research benchmark for LLMs, another in-silico benchmark this RCT contrasts with real-world performance."
    394     },
    395     {
    396       "title": "AI Tutoring Outperforms In-Class Active Learning: An RCT",
    397       "authors": ["Greg Kestin"],
    398       "year": 2025,
    399       "doi": "10.1038/s41598-025-97652-6",
    400       "relevance": "RCT on AI tutoring effectiveness showing counterintuitive results vs. benchmarks, methodological comparator."
    401     },
    402     {
    403       "title": "The Levers of Political Persuasion with Conversational Artificial Intelligence",
    404       "authors": ["Kobi Hackenburg"],
    405       "year": 2025,
    406       "doi": "10.1126/science.aea3884",
    407       "relevance": "RCT on LLM persuasion capabilities, another domain where RCTs produced counterintuitive results vs. expectations."
    408     },
    409     {
    410       "title": "International AI Safety Report 2026",
    411       "authors": ["Yoshua Bengio"],
    412       "year": 2026,
    413       "relevance": "Major AI safety policy document highlighting biosecurity as key emerging risk, directly relevant to policy implications."
    414     },
    415     {
    416       "title": "Estimating Worst-Case Frontier Risks of Open-Weight LLMs",
    417       "authors": ["Eric Wallace"],
    418       "year": 2025,
    419       "arxiv_id": "2508.03153",
    420       "relevance": "Framework for worst-case risk estimation that this paper explicitly contrasts its findings against."
    421     },
    422     {
    423       "title": "The WMDP Benchmark: Measuring and Reducing Malicious Use With Unlearning",
    424       "authors": ["Nathaniel Li"],
    425       "year": 2024,
    426       "arxiv_id": "2403.03218",
    427       "relevance": "Dual-use knowledge benchmark for LLMs relevant to biosecurity evaluation."
    428     }
    429   ]
    430 }

Impressum · Datenschutz