calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (16297B)
      1 {
      2   "paper_slug": "advevomarl-shaping-internalized-2025",
      3   "calibration_date": "2026-02-28",
      4   "calibration_model": "opus",
      5   "total_questions": 50,
      6   "agreement_count": 50,
      7   "disagreement_count": 0,
      8   "agreement_rate": 1.0,
      9   "disagreements": [],
     10   "opus_checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": false,
     15         "justification": "No repository URL, code archive, or any mention of code release anywhere in the paper."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "Evaluation uses publicly available benchmarks: MATH-500, JailbreakBench, Wild Jailbreak, Strong Reject, AIME, LiveCodeBench, and GPQA-diamond. The custom Dadv training set (~4,000 samples) is not released, but the evaluation datasets are all standard public benchmarks."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No requirements.txt, Dockerfile, conda file, or detailed environment specification. The paper only mentions using QWen2.5 instruction-tuned models (3B and 7B) without library versions or environment setup details."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided. The method is described conceptually but no actionable instructions for replication exist."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "All results in Table 1 and Figures 2-5 are point estimates. No confidence intervals, error bars, or ± notation are reported anywhere."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper makes comparative claims such as 'consistently achieves the lowest ASR' and '12% reduction' but no statistical significance tests (p-values, t-tests, bootstrap tests, etc.) are reported."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "No formal effect size measures (Cohen's d, odds ratios) are reported. While percentage differences are mentioned in text (e.g., '12% reduction in ASR'), these are not systematically computed with baseline context across all comparisons."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No justification for the choice of 4,000 training samples, 1,000 harmful behaviors, 300 adversarial prompts, or benchmark sizes. No power analysis or reasoning for chosen N values."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "All results appear to be single-run numbers. No standard deviation, variance across seeds, interquartile range, or multiple-run spread measures are reported."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper compares against Vanilla QWen2.5 (3B and 7B), Challenger (self-verification), Inspector (external guard agent), and closed-source GPT-3.5 and GPT-4o-mini as reference models."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Baselines are contemporary: Challenger from Huang et al. 2025, Inspector approach, QWen2.5 models, and GPT-4o-mini — all from 2024-2025, appropriate for this 2025 paper."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Section 5.3 ablates dynamic vs. static attacker and individual vs. joint (MAS) defender training. Section 5.4 ablates the public baseline mechanism against a no-baseline variant with training dynamics shown in Figure 5."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Three metrics are used: Attack Success Rate (ASR), Contagion Rate (CR), and task performance (accuracy for math/reasoning, Pass@1 for coding), measuring both safety and utility."
     80       },
     81       "human_evaluation": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "The paper evaluates automated safety metrics (ASR, CR) and task accuracy via automated benchmarks. Human evaluation of system outputs is not clearly relevant to the claims about attack success rates and task accuracy."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Training uses MATH-500 for defenders and a custom adversarial pool for attackers. Evaluation uses separate benchmarks: AIME, GPQA-diamond, LiveCodeBench for task evaluation; JailbreakBench, Wild Jailbreak, Strong Reject, and the NetSafe official dataset for safety evaluation."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Table 1 provides a detailed breakdown by attack scenario (NetSafe, AutoInject, UserHijack) and by system topology (chain, tree, complete), giving 21 distinct evaluation conditions per model."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The paper acknowledges that CR remains elevated (up to ~35%) in complete graph topology even for AdvEvo-MARL, the 3B variant shows a maximum 3% accuracy drop, and Challenger-7B sometimes performs worse than vanilla (38.33% ASR vs. 22.29% for vanilla in UserHijack complete)."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section 5.4 reports that the no-baseline variant exhibits 'non-stationary behavior and even degraded performance in later stages' with a 13.3% drop in defender response length. The 3B model's accuracy drop of up to 3% is also reported."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Abstract claims 'ASR below 20%' — Table 1 shows maximum ASR of 17.68% for AdvEvo-MARL-3B in complete topology. 'baselines reach up to 38.33%' — Table 1 confirms Challenger-7B at 38.33% in UserHijack complete. '+3.67% on reasoning tasks' — approximately consistent with Figure 2, though this exact number is not reported in a table."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Causal claims are made via ablation studies with single-variable manipulation: dynamic vs. static attacker (Section 5.3), individual vs. joint training (Section 5.3), and with vs. without public baseline (Section 5.4). These controlled comparisons adequately support the causal attributions."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The conclusion describes AdvEvo-MARL as 'a promising and unified framework for building safe and capable multi-agent systems' without bounding to the tested conditions: QWen2.5 3B/7B only, three specific attack types, three topologies, and one model family. The title and abstract also make broad claims not bounded to the experimental scope."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "No alternative explanations are discussed. For example, the 7B model's improved task performance after safety training could be due to general RL training benefits rather than safety-specific mechanisms. The paper does not consider confounds or alternative interpretations."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper uses 'QWen2.5 instruction-tuned models (3B and 7B)' without specifying exact checkpoint names (e.g., 'Qwen2.5-7B-Instruct' with a version hash). GPT-3.5 and GPT-4o-mini comparison models also lack snapshot dates or API version identifiers."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "No actual prompt text is provided. System prompts for defenders, adversarial prompts for attackers, and evaluation prompts for the LLM-as-judge are all described only in natural language without the actual text used."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "Reward weights are partially reported (alpha_s=1, beta_t=0.5 in first half, reversed in second; format rewards 0.5/-0.1; safety/task rewards 1/-1). However, critical RL hyperparameters are missing: learning rate, batch size, number of training steps/epochs, clipping parameter epsilon, KL coefficient beta, and sampling temperature."
    144       },
    145       "scaffolding_described": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The multi-agent scaffolding is described in detail in Section 4 and Figure 1: three agent topologies (chain, tree, complete), attacker warm-up via SFT, adversarial co-evolutionary RL loop with REINFORCE++, public baseline for advantage estimation, and separate reward mechanisms for attackers vs. defenders."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 4.2 documents the preprocessing pipeline: sample 1,000 harmful behaviors from public datasets → apply jailbreak strategies to get Dinit → use reasoning model to synthesize multi-step traces → filter with LLM-as-judge removing contradictory, off-topic, or vague trajectories → resulting in ~4,000 samples in Dadv."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "There is no dedicated limitations or threats-to-validity section. The Ethics Statement (Section 7) discusses social implications but not methodological limitations. The conclusion contains no limitations discussion."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "No threats to validity are discussed anywhere in the paper. Issues such as small model sizes (3B/7B only), limited attack types, single-run results, potential overfitting to specific attack strategies, and single model family evaluation are not addressed."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "The paper does not explicitly state what the results do NOT show. The conclusion broadly claims 'promising and unified framework for building safe and capable multi-agent systems' without bounding to the tested conditions (specific models, attack types, topologies)."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No raw experimental data, model outputs, evaluation logs, or detailed results in downloadable format are provided. Only aggregated metrics in tables and figures are presented."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 4.2 describes training data collection: sampling 1,000 harmful behaviors from public datasets, applying jailbreak strategies, synthesizing reasoning traces, and filtering. Section 5.1 describes evaluation datasets sourced from JailbreakBench, Wild Jailbreak, Strong Reject, and standard task benchmarks."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants in this study. All data comes from automated benchmarks and synthetic adversarial prompt construction."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The data pipeline from seed prompts to final training data is documented with counts: 1,000 harmful behaviors → jailbreak strategy application → reasoning trace synthesis → LLM-as-judge filtering → ~4,000 samples. Each step and transition is described."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No acknowledgments section, no funding disclosure, no mention of grants or corporate sponsors anywhere in the paper."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are listed on the first page: Northwestern University, University of Illinois at Chicago, University of Rochester, and Carnegie Mellon University."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": false,
    208         "answer": false,
    209         "justification": "No funding is disclosed, so funder independence cannot be assessed. Without funding information, this criterion is not evaluable."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests statement, patent disclosure, or declaration of financial interests appears anywhere in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "QWen2.5 instruction-tuned models are used but their training data cutoff dates are not stated. GPT-3.5 and GPT-4o-mini comparison models also lack training cutoff information."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "AIME, GPQA-diamond, and LiveCodeBench are used for evaluation, but there is no discussion of whether QWen2.5's pre-training data may have included these benchmarks or their solutions."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "LiveCodeBench is cited as 'contamination free' (Jain et al., 2024), but GPQA-diamond and AIME are public benchmarks whose solutions could appear in QWen2.5's training data, and this contamination risk is not addressed for these benchmarks."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants in this study."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants. The Ethics Statement (Section 7) discusses social implications of the research, not IRB approval."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in this study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "The paper proposes a method requiring multi-agent RL training of LLMs but reports no inference cost, API cost, tokens consumed, or wall-clock time for training or evaluation."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No GPU hours, hardware specifications, total API spend, or training time are stated anywhere in the paper."
    281       }
    282     }
    283   }
    284 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs