scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30046B)
      1 {
      2   "paper": {
      3     "title": "SafePro: Evaluating the Safety of Professional-Level AI Agents",
      4     "authors": [
      5       "Kaiwen Zhou",
      6       "Shreedhar Jangam",
      7       "Ashwin Nagarajan",
      8       "Tejas Polu",
      9       "Suhas Oruganti",
     10       "Chengzhi Liu",
     11       "Ching-Chen Kuo",
     12       "Yuting Zheng",
     13       "Sravana Narayanaraju",
     14       "Xin Eric Wang"
     15     ],
     16     "year": 2026,
     17     "venue": "arXiv",
     18     "arxiv_id": "2601.06663",
     19     "doi": "10.48550/arXiv.2601.06663"
     20   },
     21   "scan_version": 2,
     22   "active_modules": ["experimental_rigor", "data_leakage"],
     23   "methodology_tags": ["benchmark-eval"],
     24   "key_findings": "SafePro evaluates 8 state-of-the-art LLMs as AI agents on 275 professional safety tasks, finding an average unsafe rate of 54.5%. Claude-Haiku 4.5 scored best (22.3% unsafe) while Gemini 2.5-Pro scored worst (76.4%). A significant knowledge-alignment gap exists: models can identify unsafe instructions in a QA setting (73-92% recall) but fail to refuse them when acting as instruction-following agents (33-78% recall). Existing safety guardrail models perform poorly on professional-domain tasks (10.9-50.5% detection accuracy).",
     25   "checklist": {
     26     "artifacts": {
     27       "code_released": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No code repository URL, GitHub link, or archive is provided anywhere in the paper. No mention of code availability."
     31       },
     32       "data_released": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No dataset download link or release plan is mentioned. The 275-sample SafePro dataset is described but not made available."
     36       },
     37       "environment_specified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Section 4.1 mentions 'a custom docker image that includes necessary python packages and tools' but does not provide the Dockerfile, package list, or version specifications."
     41       },
     42       "reproduction_instructions": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The experimental setup in Section 4.1 gives high-level parameters (max 25 turns, CodeAct agent) but insufficient detail for reproduction."
     46       }
     47     },
     48     "statistical_methodology": {
     49       "confidence_intervals_or_error_bars": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "All results in Tables 5, 6, 7, and 8 are reported as point estimates (e.g., '55.6%', '67.3%') without any confidence intervals, error bars, or uncertainty measures."
     53       },
     54       "significance_tests": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper compares models' unsafe rates and mitigation effectiveness without any statistical significance tests. Claims like 'Claude-Haiku 4.5 achieves the lowest unsafe rate' are based on comparing raw numbers."
     58       },
     59       "effect_sizes_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "Results are reported as raw unsafe rates and raw percentage differences (e.g., '5-10% reduction' for safety prompts). No formal effect sizes (Cohen's d, odds ratios) or systematic contextual framing of improvements are provided."
     63       },
     64       "sample_size_justified": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The dataset has 275 samples and the mitigation experiment uses 100 randomly sampled tasks (Section 4.4.1). Neither sample size is justified with power analysis or rationale."
     68       },
     69       "variance_reported": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No variance, standard deviation, or spread measures are reported. All evaluations appear to be single-run with no indication of result stability across multiple runs."
     73       }
     74     },
     75     "evaluation_design": {
     76       "baselines_included": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Table 5 compares 8 different AI models against each other. Table 3 compares SafePro against existing agent safety evaluation datasets (AgentHarm, InjectAgent, Browser-art, SafeArena, RiOSWorld, SciSafetyBench)."
     80       },
     81       "baselines_contemporary": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The evaluated models include the latest releases: GPT-5.2, GPT-5, Claude-Haiku 4.5, Gemini 3-Flash, Grok 4.1 Fast, Deepseek-V3.2 — all contemporary state-of-the-art models."
     85       },
     86       "ablation_study": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No systematic ablation study is performed. The mitigation strategies (Section 4.4) test different approaches but do not ablate components of the benchmark or evaluation pipeline. No analysis of how individual design choices (e.g., reference files, task complexity, risk category) affect results."
     90       },
     91       "multiple_metrics": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Multiple metrics are used across experiments: Unsafe Rate (Table 5), F1 and Recall (Tables 6, 8), and Detection Accuracy (Table 7). The safety knowledge-alignment gap analysis uses both F1 and recall."
     95       },
     96       "human_evaluation": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "The safety evaluation of agent outputs relies entirely on LLM-as-a-judge (GPT-5-mini). While the cross-evaluation in Table 4 shows consistency across 3 LLM judges, no human evaluation of agent outputs or human validation of judge decisions is performed."
    100       },
    101       "held_out_test_set": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "All 275 SafePro samples are used for evaluation with no described separation between data used for developing/calibrating the judge prompt and data used for final evaluation. The mitigation experiment uses a 100-sample subset but without a dev/test split rationale."
    105       },
    106       "per_category_breakdown": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Table 5 provides per-sector breakdowns across 9 occupation sectors. Table 1 breaks down by risk category. Table 7 provides per-sector detection accuracy for guardrail models."
    110       },
    111       "failure_cases_discussed": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Figure 4 presents three detailed examples of unsafe agent behaviors across different occupations (Pharmacists, Social Workers, Customer Service), with specific agent responses shown and analyzed. Section 4.2 discusses these qualitatively."
    115       },
    116       "negative_results_reported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Section 4.4 reports that safety prompts achieve only 5-10% reduction (still high unsafe rates), and safety guardrail models perform poorly (Qwen3Guard at 10.9% detection accuracy, Table 7). The knowledge-alignment gap (Section 4.3) is itself a negative finding about current models."
    120       }
    121     },
    122     "claims_and_evidence": {
    123       "abstract_claims_supported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Abstract claims of 'significant safety vulnerabilities' (supported by 54.5% avg unsafe rate, Table 5), 'new unsafe behaviors' (supported by Figure 4 examples), 'insufficient safety judgment and weak safety alignment' (supported by Tables 6/8), and 'encouraging improvements' from mitigation (supported by Tables 7/8, Figure 5) are all backed by presented results."
    127       },
    128       "causal_claims_justified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Causal claims about safety prompts reducing unsafe rates (Section 4.4.1) are supported by controlled with/without comparisons on the same 100 tasks. The knowledge-alignment gap claim (Section 4.3) is supported by comparing the same models in two different settings (IF vs QA)."
    132       },
    133       "generalization_bounded": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The title claims 'Professional-Level AI Agents' broadly, and the abstract calls it 'comprehensive.' However, results are limited to 275 tasks from 9 U.S. sectors, single-turn interactions, one agent framework (CodeAct), and no video/audio tasks. While the Limitations section acknowledges some of these boundaries, the title and abstract significantly overstate the scope."
    137       },
    138       "alternative_explanations_discussed": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Section 4.3 explicitly investigates two competing explanations for high unsafe rates: (1) models lack safety knowledge vs. (2) models have knowledge but fail to apply it in instruction-following. They test both hypotheses empirically and conclude it is primarily an alignment gap, not a knowledge gap."
    142       },
    143       "proxy_outcome_distinction": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The paper measures 'unsafe rate' as classified by an LLM judge and uses it as a proxy for actual safety risk in professional settings. No discussion of the gap between LLM-judged classification and real-world safety consequences. The cross-evaluation (Table 4) validates judge consistency but not the proxy-outcome gap."
    147       }
    148     },
    149     "setup_transparency": {
    150       "model_versions_specified": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "Models are identified by marketing names only: 'GPT-5.2', 'GPT-5', 'GPT-5-mini', 'Claude-Haiku 4.5', 'Gemini 2.5-Pro', 'Gemini 3-Flash', 'Grok 4.1 Fast', 'Deepseek-V3.2'. No snapshot dates, API versions, or specific model IDs are provided."
    154       },
    155       "prompts_provided": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "The safety judge prompt is provided in Table 9, the guardrail safety policy prompt in Table 10, the QA evaluation prompt in Table 11, and the QA prompt with safety categories in Table 12. The safety instruction prompt is given in Section 4.4.1. These are the key prompts for the paper's contribution."
    159       },
    160       "hyperparameters_reported": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "Section 4.1 states 'maximum number of interaction turns to 25' and 'reasoning effort set to medium by default.' However, critical API parameters like temperature, top-p, and max tokens are not reported for any model."
    164       },
    165       "scaffolding_described": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The CodeAct agent in OpenHands is mentioned with a high-level description of action spaces ('code execution, web search, file operations, and python interpreter usage'). However, detailed scaffolding architecture — retry logic, feedback mechanisms, memory/context management, tool selection strategy — is not described."
    169       },
    170       "data_preprocessing_documented": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 3.1 documents the full data creation process: requirements definition, two creation approaches (195 benign task transformations from GDPval + 80 new harmful tasks), iterative review process (Figure 2), and quality control criteria. Good and bad examples are provided in Appendix A.1."
    174       }
    175     },
    176     "limitations_and_scope": {
    177       "limitations_section_present": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "A dedicated 'Limitations' section is present at the end of the paper with substantive discussion of specific shortcomings."
    181       },
    182       "threats_to_validity_specific": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The Limitations section identifies specific threats: (1) based on GDPval which 'focuses on U.S. occupations and contains only single communication-turn digital tasks,' (2) 'does not include tasks that require the agent to process and generate video or audio content,' (3) 'focuses on safety evaluation with instructions' only."
    186       },
    187       "scope_boundaries_stated": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The Limitations section explicitly states what is NOT covered: multi-turn/multi-agent interactions, non-U.S. occupations, video/audio modalities, and emergent safety risks beyond instruction-following. Future directions are framed as what the current work does not address."
    191       }
    192     },
    193     "data_integrity": {
    194       "raw_data_available": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "Neither the SafePro dataset nor the raw agent responses are made available. Only aggregated results (unsafe rates, F1/recall) are presented."
    198       },
    199       "data_collection_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Section 3.1 describes the data creation in detail: task requirements (complexity, clarity, relevance, realism, specificity), two creation approaches (benign task transformation and new harmful task generation), and iterative review process with quality control criteria."
    203       },
    204       "recruitment_methods_described": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The paper describes who created the tasks ('initially created by one of the authors'), the review process ('reviewed and revised by a separate team member'), and for LLM-assisted generation, the use of 'strong LLMs such as GPT-5 to assist in drafting the initial version.' Task source is documented: 195 from GDPval, 80 newly created."
    208       },
    209       "data_pipeline_documented": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Figure 2 illustrates the full data creation pipeline: criteria definition → initial task draft generation (two approaches) → iterative human review (Author A drafts, Author B reviews, feedback loop) → finalized harmful task. Each step is described in Section 3.1."
    213       }
    214     },
    215     "conflicts_of_interest": {
    216       "funding_disclosed": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No funding information, acknowledgments section, or grant disclosures appear in the paper. The work involves university (UCSC, UCSB) and industry (eBay) collaborators, suggesting likely funding sources that are not disclosed."
    220       },
    221       "affiliations_disclosed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "Author affiliations are listed: UCSC, UCSB, and eBay. The industry affiliation with eBay is visible in the author list."
    225       },
    226       "funder_independent_of_outcome": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No funding is disclosed, making independence assessment impossible. eBay co-authors are present; while eBay's products are not directly evaluated, the company deploys AI agents in professional settings (e.g., retail), creating potential interest in how safety standards are framed."
    230       },
    231       "financial_interests_declared": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No competing interests or financial disclosure statement appears in the paper."
    235       }
    236     },
    237     "contamination": {
    238       "training_cutoff_stated": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No training data cutoff dates are stated for any of the 8 evaluated models. The paper uses pre-trained models without indicating when their training data was collected."
    242       },
    243       "train_test_overlap_discussed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "No discussion of whether any SafePro tasks (especially the 195 derived from GDPval) overlap with model training data. GDPval was published before most evaluated models' training cutoffs, creating potential overlap."
    247       },
    248       "benchmark_contamination_addressed": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "While SafePro is newly created (reducing contamination risk), 195 of 275 tasks are adapted from GDPval which was publicly available. No contamination analysis or discussion is provided."
    252       }
    253     },
    254     "human_studies": {
    255       "pre_registered": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in the study. The evaluation is entirely automated (AI models evaluated by LLM judges)."
    259       },
    260       "irb_or_ethics_approval": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants. The study evaluates AI models on synthesized professional tasks."
    264       },
    265       "demographics_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in the evaluation."
    269       },
    270       "inclusion_exclusion_criteria": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in the evaluation."
    274       },
    275       "randomization_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in the evaluation."
    279       },
    280       "blinding_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in the evaluation."
    284       },
    285       "attrition_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human participants in the evaluation."
    289       }
    290     },
    291     "cost_and_practicality": {
    292       "inference_cost_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No inference costs, API costs, tokens consumed, or wall-clock time are reported. Running 275 tasks × 8 models × up to 25 interaction turns represents substantial compute, but costs are not quantified."
    296       },
    297       "compute_budget_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No total computational budget, API spend, or hardware specifications are stated. The paper runs extensive evaluations across 8 models plus mitigation experiments without reporting total compute."
    301       }
    302     },
    303     "experimental_rigor": {
    304       "seed_sensitivity_reported": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "No mention of multiple random seeds or seed sensitivity analysis. All evaluations appear to be single-run. LLM outputs can vary significantly across runs, especially with non-zero temperature."
    308       },
    309       "number_of_runs_stated": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "The number of experimental runs is not stated. Results appear to be from single runs per model-task combination with no repetition."
    313       },
    314       "hyperparameter_search_budget": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "No hyperparameter search is reported. The paper uses 'medium' reasoning effort and 25 max turns without justification or exploration of alternatives."
    318       },
    319       "best_config_selection_justified": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "The choices of 'medium' reasoning effort, 25 max turns, and GPT-5-mini as judge are not justified. No comparison of alternative configurations is provided."
    323       },
    324       "multiple_comparison_correction": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "Multiple comparisons are made across 8 models, 9 sectors, and multiple risk categories without any correction for family-wise error rate. No statistical tests are used at all."
    328       },
    329       "self_comparison_bias_addressed": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "The authors created both the benchmark and the evaluation criteria, then evaluated models against their own unsafe/safe criteria and judge prompts. This author-evaluation bias is not acknowledged or discussed."
    333       },
    334       "compute_budget_vs_performance": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "Different models have different computational costs (e.g., GPT-5.2 vs GPT-5-mini), and mitigation methods add overhead (extra classification step). No analysis relates compute budget to safety performance."
    338       },
    339       "benchmark_construct_validity": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "The paper does not discuss whether 'unsafe rate' as classified by an LLM judge actually measures real-world safety of professional AI agents. The cross-evaluation (Table 4) validates judge consistency but not whether the benchmark captures meaningful professional safety risks."
    343       },
    344       "scaffold_confound_addressed": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "All models are evaluated using the same CodeAct agent framework in OpenHands (Section 4.1), controlling for scaffold-related confounds in model comparisons."
    348       }
    349     },
    350     "data_leakage": {
    351       "temporal_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "195 tasks are adapted from GDPval (published 2025), which may have been in the training data of the evaluated models. No temporal leakage analysis is performed."
    355       },
    356       "feature_leakage_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "The task instructions contain explicit malicious intent (e.g., 'please downplay or omit company A's legal risks'). No discussion of whether this explicit framing provides information that leaks the expected 'unsafe' classification to the model or judge."
    360       },
    361       "non_independence_addressed": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "195 tasks share structural templates with GDPval, and 80 new tasks were drafted with GPT-5 assistance using GDPval examples as in-context demonstrations. The non-independence between tasks is not addressed."
    365       },
    366       "leakage_detection_method": {
    367         "applies": true,
    368         "answer": false,
    369         "justification": "No concrete leakage detection or prevention method is used. No decontamination pipeline, n-gram overlap analysis, or temporal splits are applied."
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "Most state-of-the-art AI models exhibit high unsafe rates of around or over 50% on professional safety tasks, with an average of 54.5% across all models.",
    376       "evidence": "Table 5 shows unsafe rates: Gemini 2.5-Pro 76.4%, Gemini 3-Flash 67.3%, Deepseek-V3.2 62.2%, Grok 4.1 Fast 58.2%, GPT-5-mini 55.6%, GPT-5 47.3%, GPT-5.2 32.8%, Claude-Haiku 4.5 22.3%.",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "Claude-Haiku 4.5 achieves the lowest unsafe rate (22.3%), consistent with Claude models' strong safety performance in prior evaluations.",
    381       "evidence": "Table 5 shows Claude-Haiku 4.5 at 22.3% average unsafe rate, lowest among all 8 evaluated models.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "AI models possess substantial safety knowledge to identify unsafe instructions but struggle to apply this knowledge in instruction-following settings (knowledge-alignment gap).",
    386       "evidence": "Table 6 shows a significant F1 gap between instruction-following (IF) and QA settings: Gemini 3-Flash 49.3 vs 84.2, GPT-5-mini 61.5 vs 88.9, Claude-Haiku 4.5 87.3 vs 95.0. Recall gaps are similarly large (Section 4.3).",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Adding safety prompts consistently reduces unsafe rates by 5-10%, but rates remain high.",
    391       "evidence": "Figure 5 shows reductions on 100 sampled tasks: GPT-5-mini 57→48%, Gemini-3-Flash 65→60%, Grok-4.1-Fast 58→48%. However, only 3 models were tested on a subset of 100 tasks.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Existing safeguard models struggle to identify unsafe instructions in professional settings, with detection accuracy of only 50.5% (gpt-oss-safeguard) and 10.9% (Qwen3Guard).",
    396       "evidence": "Table 7 reports detection accuracy across 9 sectors for both models, showing low overall performance and significant cross-sector variation.",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "LLM-based safety classification with safety category definitions can serve as an effective mitigation method, achieving similar performance to Claude-Haiku 4.5.",
    401       "evidence": "Table 8 shows that adding safety category definitions improves classification: Gemini 3-Flash QA recall jumps from 73.1% to 91.3%, GPT-5-mini from 81.5% to 88.4%.",
    402       "supported": "moderate"
    403     }
    404   ],
    405   "red_flags": [
    406     {
    407       "flag": "No error bars or uncertainty quantification",
    408       "detail": "All results are single-run point estimates without confidence intervals, standard deviations, or any spread measures. LLM outputs are stochastic, and unsafe rate estimates on 275 tasks have non-trivial sampling uncertainty. The 100-task mitigation subset is especially vulnerable to variance."
    409     },
    410     {
    411       "flag": "LLM-as-judge without human validation",
    412       "detail": "Safety evaluation relies entirely on GPT-5-mini as judge. While the cross-evaluation (Table 4) shows consistency across 3 LLM judges, no ground-truth human labels are used to validate the judge's accuracy. All judges could share systematic blind spots."
    413     },
    414     {
    415       "flag": "No code or data release",
    416       "detail": "None of the SafePro dataset, agent responses, evaluation code, or judge outputs are released. Results cannot be independently verified, and the benchmark cannot be used by others."
    417     },
    418     {
    419       "flag": "Small subset for mitigation experiments",
    420       "detail": "The mitigation strategies (Section 4.4.1) are tested on only 100 randomly sampled tasks with 3 of 8 models. This subset may not be representative of the full 275 tasks or all 8 models."
    421     },
    422     {
    423       "flag": "Potential circular evaluation",
    424       "detail": "GPT-5-mini is used as both the LLM judge (Section 3.2) and as one of the evaluated agent models (Table 5). While Table 4 shows no self-favoring bias, using the same model family for both roles introduces methodological concerns."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "AgentHarm: A Benchmark for Measuring Harmfulness of LLM Agents",
    430       "authors": ["Maksym Andriushchenko", "Alexandra Souly"],
    431       "year": 2024,
    432       "arxiv_id": "2410.09024",
    433       "relevance": "LLM agent safety benchmark evaluating harmful behaviors, directly compared with SafePro."
    434     },
    435     {
    436       "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents",
    437       "authors": ["Edoardo Debenedetti", "Jie Zhang"],
    438       "year": 2024,
    439       "relevance": "Prompt injection evaluation benchmark for LLM agents, representing environment-sourced risks in agent safety."
    440     },
    441     {
    442       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    443       "authors": ["Carlos E Jimenez", "John Yang"],
    444       "year": 2023,
    445       "arxiv_id": "2310.06770",
    446       "relevance": "Foundational benchmark for evaluating LLM agents on software engineering tasks."
    447     },
    448     {
    449       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    450       "authors": ["Xingyao Wang", "Boxuan Li"],
    451       "year": 2024,
    452       "arxiv_id": "2407.16741",
    453       "relevance": "The agent platform used for all SafePro evaluations; key infrastructure for agentic AI research."
    454     },
    455     {
    456       "title": "Aligned LLMs Are Not Aligned Browser Agents",
    457       "authors": ["Priyanshu Kumar", "Elaine Lau"],
    458       "year": 2025,
    459       "relevance": "Demonstrates that LLM alignment does not transfer to agent settings, directly relevant to SafePro's knowledge-alignment gap finding."
    460     },
    461     {
    462       "title": "MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering",
    463       "authors": ["Jun Shern Chan", "Neil Chowdhury"],
    464       "year": 2024,
    465       "arxiv_id": "2410.07095",
    466       "relevance": "Professional-level AI agent benchmark for ML engineering tasks."
    467     },
    468     {
    469       "title": "GDPval: Evaluating AI Model Performance on Real-World Economically Valuable Tasks",
    470       "authors": ["Tejal Patwardhan", "Rachel Dias"],
    471       "year": 2025,
    472       "arxiv_id": "2510.04374",
    473       "relevance": "Source of 195 benign professional tasks that were modified to create SafePro's harmful task dataset."
    474     },
    475     {
    476       "title": "RiOSWorld: Benchmarking the Risk of Multimodal Computer-Use Agents",
    477       "authors": ["Jingyi Yang", "Shuai Shao"],
    478       "year": 2025,
    479       "relevance": "Multimodal agent safety benchmark for computer-use risks, compared with SafePro in Table 3."
    480     },
    481     {
    482       "title": "OS-Harm: A Benchmark for Measuring Safety of Computer Use Agents",
    483       "authors": ["Thomas Kuntz", "Agatha Duzan"],
    484       "year": 2025,
    485       "arxiv_id": "2506.14866",
    486       "relevance": "Computer use agent safety benchmark, part of the growing agent safety evaluation literature."
    487     },
    488     {
    489       "title": "SHADE-Arena: Evaluating Sabotage and Monitoring in LLM Agents",
    490       "authors": ["Jonathan Kutasov", "Yuqi Sun"],
    491       "year": 2025,
    492       "arxiv_id": "2506.15740",
    493       "relevance": "Evaluates more complex agent safety problems including hidden sabotage goals."
    494     },
    495     {
    496       "title": "OpenAgentSafety: A Comprehensive Framework for Evaluating Real-World AI Agent Safety",
    497       "authors": ["Sanidhya Vijayvargiya", "Aditya Bharat Soni"],
    498       "year": 2025,
    499       "arxiv_id": "2507.06134",
    500       "relevance": "Framework for evaluating AI agent safety in real-world scenarios, relevant to professional agent safety evaluation."
    501     },
    502     {
    503       "title": "Remote Labor Index: Measuring AI Automation of Remote Work",
    504       "authors": ["Mantas Mazeika", "Alice Gatti"],
    505       "year": 2025,
    506       "arxiv_id": "2510.26787",
    507       "relevance": "Measures AI capability on professional remote work tasks, contextualizing the automation risks SafePro evaluates."
    508     }
    509   ]
    510 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs