calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (19688B)
      1 {
      2   "paper_slug": "advancing-llm-safe-2025",
      3   "calibration_date": "2026-02-28",
      4   "total_questions": 50,
      5   "agreement_count": 47,
      6   "disagreement_count": 3,
      7   "agreement_rate": 0.94,
      8   "disagreements": [
      9     {
     10       "category": "evaluation_design",
     11       "question": "human_evaluation",
     12       "sonnet": {"applies": false, "answer": false},
     13       "opus": {"applies": true, "answer": false},
     14       "direction": "applies_boundary",
     15       "explanation": "Sonnet set applies=false, reasoning that automated evaluation against benchmark labels is appropriate and human evaluation is irrelevant. Opus set applies=true because safety classification has inherently subjective elements — the ground truth labels in this paper are derived from keyword matching ('Sorry', 'unable' = safe; 'sure', 'certainly' = harmful), not human annotation. Human evaluation of whether responses are truly safe or harmful is relevant to validating these proxy labels and the system's real-world utility. The criterion could reasonably be expected of this paper type, so applies=true, answer=false is more appropriate."
     16     },
     17     {
     18       "category": "setup_transparency",
     19       "question": "hyperparameters_reported",
     20       "sonnet": {"applies": true, "answer": true},
     21       "opus": {"applies": true, "answer": false},
     22       "direction": "sonnet_generous",
     23       "explanation": "Sonnet gave answer=true based on the ranker hyperparameters reported in Section 4.1 (learning rate 0.001, weight decay 0.0001, dropout 0.1, momentum 1.0, bottom 25% layers). However, the paper uses LLMs to generate candidate responses via 'stochastic decoding with moderate temperature' (Section 3.1) without specifying the actual temperature value, top-p, or max tokens. The schema explicitly states: 'If the paper uses an LLM API without stating temperature/sampling settings, NO — these significantly affect output.' The LLM sampling parameters are critical to reproducibility since they determine candidate diversity, which directly affects SRR's performance."
     24     },
     25     {
     26       "category": "conflicts_of_interest",
     27       "question": "funder_independent_of_outcome",
     28       "sonnet": {"applies": false, "answer": false},
     29       "opus": {"applies": true, "answer": false},
     30       "direction": "applies_boundary",
     31       "explanation": "Sonnet set applies=false reasoning that no funding is disclosed so independence cannot be assessed. The schema says 'NA if unfunded' — but the paper never states it is unfunded; it simply has no funding disclosure at all. An absent funding statement is different from a confirmed absence of funding. Since we cannot confirm the paper is unfunded, the criterion applies (the paper could have funding that is undisclosed), and the answer is false because independence cannot be verified without disclosure."
     32     }
     33   ],
     34   "opus_checklist": {
     35     "artifacts": {
     36       "code_released": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The abstract states 'Our code will be available upon publication.' Per the schema, a promise of future release counts as NO. No working URL or archive is provided."
     40       },
     41       "data_released": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper uses publicly available benchmarks: HarmBench, SorryBench, JailbreakBench, BBQ, Harmcopy, and MATH. Per the schema, 'If the data is a standard public benchmark they didn't modify, YES.' The custom train/test splits (50 per dataset for training) are not released, but the underlying datasets are public."
     45       },
     46       "environment_specified": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper lists ranker hyperparameters (learning rate, weight decay, dropout, momentum) but provides no requirements.txt, Dockerfile, or environment setup section with library versions. No framework versions (PyTorch, HuggingFace, etc.) are specified."
     50       },
     51       "reproduction_instructions": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No step-by-step reproduction instructions are provided. Algorithm 1 gives pseudocode but not actual commands or a README. Code is promised but not released."
     55       }
     56     },
     57     "statistical_methodology": {
     58       "confidence_intervals_or_error_bars": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "All results in Tables 1-8 are reported as single point estimates (accuracy percentages) with no confidence intervals, error bars, or +/- notation."
     62       },
     63       "significance_tests": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper claims 'our method greatly outperforms the reward model' but performs no statistical significance tests. No p-values, t-tests, or bootstrap tests are reported."
     67       },
     68       "effect_sizes_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The paper reports raw accuracy numbers in tables but does not report formal effect sizes (Cohen's d, odds ratios, relative risk, or percentage improvement). Showing two numbers side by side in a table constitutes raw differences, not effect size reporting per the schema."
     72       },
     73       "sample_size_justified": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "The paper uses 50 examples per dataset for training and the rest for testing. No justification for this split size is given, and no power analysis is discussed."
     77       },
     78       "variance_reported": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "All results appear to be from single runs. No standard deviation, variance, or results across multiple random seeds are reported."
     82       }
     83     },
     84     "evaluation_design": {
     85       "baselines_included": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "A GPT-2-based reward model serves as the baseline for ranking accuracy (Tables 1-4). For real-world application (Tables 5-7), 'first accuracy' (choosing the highest-probability response) is the baseline."
     89       },
     90       "baselines_contemporary": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The primary baseline is a GPT-2-based reward model (GPT-2 dates from 2019). The authors note that it is '20 times larger than the ranker model', but it is fundamentally outdated. Contemporary safety defense methods (SafeDecoding, LlamaGuard, self-reminder) are discussed in related work but not used as experimental baselines."
     94       },
     95       "ablation_study": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No ablation study is present. Key design choices — layer selection (bottom 25%), single-layer transformer encoder, listwise KL-divergence loss, cosine similarity scoring — are not individually evaluated."
     99       },
    100       "multiple_metrics": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "Only ranking accuracy is used as an evaluation metric throughout all experiments (Tables 1-8). No secondary metrics such as attack success rate, refusal rate, false positive rate, or F1 are reported."
    104       },
    105       "human_evaluation": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No human evaluation of system outputs is conducted. Safety labels are derived from keyword matching ('Sorry', 'unable' = safe; 'sure', 'certainly' = harmful) rather than human annotation. Human evaluation of whether responses are truly safe is relevant given the limitations of keyword-based labeling as a safety proxy."
    109       },
    110       "held_out_test_set": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 4.1 explicitly states: 'For each dataset, we extract 50 of them as the training dataset, and the rest is used as the testing dataset.' The train/test separation is clear."
    114       },
    115       "per_category_breakdown": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "Results are broken down by model and dataset but not by category within datasets. HarmBench contains prompts in 'various areas' (Section 4.1) and BBQ has multiple bias categories, but no within-dataset category breakdowns are provided."
    119       },
    120       "failure_cases_discussed": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "No failure case analysis or qualitative error examples are presented. The Limitations section mentions general areas for improvement but does not examine specific cases where SRR fails."
    124       },
    125       "negative_results_reported": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The BBQ fairness result (52.52% average accuracy, with Vicuna at 50.64% — essentially random) is honestly reported as a negative result. Section 4.4 explicitly acknowledges the low performance."
    129       }
    130     },
    131     "claims_and_evidence": {
    132       "abstract_claims_supported": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The abstract claims SRR 'significantly improves robustness to adversarial prompts,' supported by Tables 5-7 showing improvements over baseline. The claim about selecting safe responses using hidden states is supported by the methodology and experimental results. Claims are appropriately hedged."
    136       },
    137       "causal_claims_justified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper makes causal claims such as 'SRR significantly enhances the safety alignment of LLMs' and 'SRR can effectively improve the safety mechanisms of LLMs.' While the controlled benchmark comparisons support the narrower claim that SRR ranks better than the GPT-2 reward model, the broader causal claims about improving 'safety mechanisms' and 'safety alignment' extend beyond what a comparison against a single weak baseline demonstrates."
    141       },
    142       "generalization_bounded": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The paper tests only three 7B open-source models but makes broad claims about 'LLM safety' generally. The title 'Advancing LLM Safe Alignment' and claims like 'SRR serves as a practical and effective safeguard module for LLM alignment' are not bounded to the tested model sizes, architectures, or attack types."
    146       },
    147       "alternative_explanations_discussed": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "No alternative explanations for results are discussed. The keyword-based labeling confound (the ranker may learn keyword patterns rather than genuine safety-relevant representations) is not addressed. No consideration of whether simpler methods could achieve similar results."
    151       }
    152     },
    153     "setup_transparency": {
    154       "model_versions_specified": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The paper specifies 'Qwen2.5-7b-Instruct', 'Mistral-7b-v0.3', and 'Vicuna-7b-v1.5' with references. These are specific open-source model checkpoints with version identifiers."
    158       },
    159       "prompts_provided": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper references 'In-context Attack and In-context Defense' for response sampling but does not provide the actual prompt text or demonstrations used. The reader cannot reconstruct what was sent to the models."
    163       },
    164       "hyperparameters_reported": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "Ranker hyperparameters are reported (learning rate 0.001, weight decay 0.0001, dropout 0.1, momentum 1.0, bottom 25% layers). However, the LLM sampling parameters are not specified — Section 3.1 says 'stochastic decoding with moderate temperature' without giving the exact temperature value, top-p, or max tokens. Per the schema, 'If the paper uses an LLM API without stating temperature/sampling settings, NO.'"
    168       },
    169       "scaffolding_described": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The SRR framework is described in detail in Section 3: candidate response generation (3.1), ranker architecture with three steps (3.2), training objectives (3.3), and pseudocode in Algorithm 1. The inference pipeline is also described."
    173       },
    174       "data_preprocessing_documented": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 4.1 documents the preprocessing pipeline: sample 20 responses each from In-context Attack and In-context Defense per prompt, filter using keyword criteria (safe: 'Sorry', 'unable', 'illegal', 'understand'; harmful: 'sure', 'certainly'), split 50 for training and the rest for testing."
    178       }
    179     },
    180     "limitations_and_scope": {
    181       "limitations_section_present": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 6 is titled 'Limitations' and contains substantive discussion of the approach's limitations."
    185       },
    186       "threats_to_validity_specific": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "The limitations in Section 6 are generic: 'might need task-specific fine-tuning,' 'adaptability to special-domain safety scenarios requires further testing,' 'effectiveness partly relies on diverse candidate responses.' These are not specific to this study's design — they don't address the keyword-based labeling confound, the weak baseline, the small sample sizes, or the limited model diversity."
    190       },
    191       "scope_boundaries_stated": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "No explicit statements about what the results do NOT show. The paper makes broad claims about 'real-world deployment' and 'safeguard module for LLM alignment' without bounding these to the three 7B models and three safety benchmarks actually tested."
    195       }
    196     },
    197     "data_integrity": {
    198       "raw_data_available": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The sampled LLM responses, filtered training/test splits, keyword-labeled data, and trained ranker weights are not released. Only the original benchmark datasets are publicly available."
    202       },
    203       "data_collection_described": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Section 4.1 describes the data collection procedure: sampling from base models using In-context Attack and In-context Defense, 20 times each per prompt. Keyword filtering criteria for labeling safe vs. harmful responses are explicitly stated."
    207       },
    208       "recruitment_methods_described": {
    209         "applies": false,
    210         "answer": false,
    211         "justification": "No human participants are involved. Data comes from LLM-generated responses to existing safety benchmarks. This criterion does not apply."
    212       },
    213       "data_pipeline_documented": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "The data pipeline is documented: (1) sample 20 attack and 20 defense responses per prompt, (2) filter by keywords to assign safety labels, (3) split into 50 training / rest test, (4) construct candidate pairs for ranking."
    217       }
    218     },
    219     "conflicts_of_interest": {
    220       "funding_disclosed": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No acknowledgments section or funding disclosure is present. There is no mention of grants, funding agencies, or corporate sponsors anywhere in the paper."
    224       },
    225       "affiliations_disclosed": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "All authors are listed with affiliations at Peking University (State Key Lab of General Artificial Intelligence, School of Intelligence Science and Technology, School of Mathematical Sciences, Institute for Artificial Intelligence). The paper evaluates open-source models not affiliated with the authors' institution."
    229       },
    230       "funder_independent_of_outcome": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No funding source is disclosed. The schema says 'NA if unfunded,' but the paper does not state it is unfunded — it simply lacks any funding disclosure. An absent funding statement differs from confirmed absence of funding. Independence cannot be verified without disclosure."
    234       },
    235       "financial_interests_declared": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No competing interests statement or declaration of financial interests appears in the paper. The absence of a disclosure statement does not confirm absence of conflicts."
    239       }
    240     },
    241     "contamination": {
    242       "training_cutoff_stated": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "The paper uses Qwen2.5-7b-Instruct, Mistral-7b-v0.3, and Vicuna-7b-v1.5 but does not state training data cutoff dates for any model. This matters because the safety benchmarks may have been in the models' training data, which could affect the distribution of safe vs. harmful responses generated."
    246       },
    247       "train_test_overlap_discussed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "No discussion of potential overlap between safety benchmark prompts and base model training data. HarmBench and JailbreakBench are public benchmarks that could appear in pre-training corpora."
    251       },
    252       "benchmark_contamination_addressed": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "HarmBench (ICML 2024), JailbreakBench (2024), and SorryBench (ICLR 2025) are public benchmarks that may predate the training cutoffs of some of the base models; because the paper states no cutoff dates, this cannot be ruled out. The paper does not address whether benchmark contamination could affect results."
    256       }
    257     },
    258     "human_studies": {
    259       "pre_registered": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved. All experiments use LLM-generated responses evaluated against automated benchmark labels."
    263       },
    264       "irb_or_ethics_approval": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants. IRB approval is not applicable."
    268       },
    269       "demographics_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "inclusion_exclusion_criteria": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "randomization_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       },
    284       "blinding_described": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in this study."
    288       },
    289       "attrition_reported": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "No human participants in this study."
    293       }
    294     },
    295     "cost_and_practicality": {
    296       "inference_cost_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The paper claims SRR is 'lightweight' and overhead is 'negligible compared to full decoding,' but no actual cost figures are provided. No token counts, wall-clock time, or cost-per-example are reported. Generating 20 candidates per prompt is a significant expense that is not quantified."
    300       },
    301       "compute_budget_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No GPU hours, hardware specifications, training time, or total compute budget is reported. The ranker has <5M parameters, but the cost of running the 7B base model to extract representations is not quantified."
    305       }
    306     }
    307   }
    308 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs