calibration.json (22461B)
1 { 2 "paper_slug": "african-woman-rhythmic-2024", 3 "calibrator": "opus", 4 "calibration_date": "2026-02-28", 5 "total_questions": 50, 6 "agreement_count": 46, 7 "disagreement_count": 4, 8 "agreement_rate": 0.92, 9 "disagreements": [ 10 { 11 "category": "evaluation_design", 12 "question": "baselines_contemporary", 13 "sonnet": {"applies": false, "answer": false}, 14 "opus": {"applies": true, "answer": false}, 15 "direction": "applies_boundary", 16 "explanation": "Sonnet set applies=false reasoning that since no baselines were included, the question of whether baselines are contemporary is inapplicable. However, the schema says applies=false is for structurally inapplicable criteria. This paper could and should have compared against other LLMs or prior bias detection methods. The fact that no baselines were included makes this applies=true, answer=false (the paper fails the criterion), not applies=false. The question is relevant to this paper type — it just wasn't done." 17 }, 18 { 19 "category": "evaluation_design", 20 "question": "human_evaluation", 21 "sonnet": {"applies": true, "answer": true}, 22 "opus": {"applies": true, "answer": false}, 23 "direction": "sonnet_generous", 24 "explanation": "Sonnet credited the authors' thematic analysis of stories and word clouds as 'human evaluation'. However, the schema description specifies human evaluation means 'human ratings, manual inspection, user studies, expert review of the system's OUTPUTS. The humans must be evaluating what the system produced.' The authors did analyze outputs qualitatively, but this was the researchers themselves doing an informal thematic analysis with no coding scheme, no inter-rater reliability, and no systematic evaluation protocol. The schema also says 'If evaluation of the system is entirely automated (e.g., pass/fail on test suites), NO.' The paper's approach is neither automated metrics nor a proper human evaluation study — it is informal qualitative observation by the authors. Without a systematic coding scheme or evaluation protocol, this does not constitute human evaluation as described by the schema." 25 }, 26 { 27 "category": "contamination", 28 "question": "training_cutoff_stated", 29 "sonnet": {"applies": true, "answer": false}, 30 "opus": {"applies": false, "answer": false}, 31 "direction": "applies_boundary", 32 "explanation": "Sonnet set applies=true, arguing the training cutoff is relevant to understanding what biases GPT-4 absorbed. However, the schema says this question is about whether 'the model's training data cutoff date [is] stated' in the context of assessing 'whether test examples could have been in the training set.' The NA rule says: 'NA if the paper does not evaluate a pre-trained model's capability on any benchmark.' This paper does not evaluate GPT-4 on any benchmark — it uses novel author-designed prompts to probe for bias. The concern here is not benchmark contamination but rather what societal biases exist in training data, which is a different question. The contamination category is about 'could the model have seen the test data during training' — which is not relevant to this paper's design." 33 }, 34 { 35 "category": "conflicts_of_interest", 36 "question": "funding_disclosed", 37 "sonnet": {"applies": true, "answer": false}, 38 "opus": {"applies": false, "answer": false}, 39 "direction": "applies_boundary", 40 "explanation": "Sonnet set applies=true and answer=false. However, the schema says 'NA only if it's clearly unfunded work (e.g., a solo independent researcher).' This paper lists authors from UCL with correspondence emails in the format 'serene.lim.21@ucl.ac.uk' — the '.21' strongly suggests a student email (enrolled 2021). The paper has no acknowledgments section, no grant numbers, and appears to be a student research project. While not exactly a 'solo independent researcher,' a student project at a university with no external funding disclosed is reasonably 'clearly unfunded work' and applies=false is defensible. That said, this is a borderline case — UCL students do sometimes have funded projects. I set applies=false because the indicators point to unfunded student work." 41 } 42 ], 43 "opus_checklist": { 44 "artifacts": { 45 "code_released": { 46 "applies": true, 47 "answer": false, 48 "justification": "No source code, GitHub repository, or code archive is provided anywhere in the paper. The paper mentions data is stored on a secure database linked to the OpenAI API, but no code for reproducing the experiments is released." 49 }, 50 "data_released": { 51 "applies": true, 52 "answer": false, 53 "justification": "No dataset is released. The Ethics section mentions 'experimental data has been recorded on a secure database' but no public access is provided. The raw LLM outputs are not available for download." 54 }, 55 "environment_specified": { 56 "applies": true, 57 "answer": false, 58 "justification": "No environment specifications are provided. The paper only mentions 'GPT-4 via the interface of ChatGPT' with no library versions, API details, Python version, or dependency files." 59 }, 60 "reproduction_instructions": { 61 "applies": true, 62 "answer": false, 63 "justification": "No step-by-step reproduction instructions are included. The methodology describes the experimental approach in narrative form but does not provide scripts, commands, or a structured protocol that would allow a researcher to replicate the study without guessing." 64 } 65 }, 66 "statistical_methodology": { 67 "confidence_intervals_or_error_bars": { 68 "applies": true, 69 "answer": false, 70 "justification": "No confidence intervals or error bars are reported anywhere. Results are presented as raw counts (e.g., '9 of 10 trials', '8 of 8 positive words') and approximate percentages (e.g., 'around 80%') with no uncertainty quantification." 71 }, 72 "significance_tests": { 73 "applies": true, 74 "answer": false, 75 "justification": "No statistical significance tests are used. The paper makes comparative claims (e.g., IAT correlates with traditional methods, gender effects differ from race effects) without any formal statistical testing — no p-values, t-tests, chi-squared tests, or similar." 76 }, 77 "effect_sizes_reported": { 78 "applies": true, 79 "answer": false, 80 "justification": "No standardized effect sizes (Cohen's d, odds ratios, etc.) are reported. Results are given as raw counts and percentages without baseline context or standardized measures of effect magnitude." 81 }, 82 "sample_size_justified": { 83 "applies": true, 84 "answer": false, 85 "justification": "The choice of 10 trials per experiment is not justified anywhere. No power analysis is discussed, and there is no acknowledgment that 10 repetitions may be insufficient for the claims being made." 86 }, 87 "variance_reported": { 88 "applies": true, 89 "answer": false, 90 "justification": "No variance, standard deviation, or spread measure is reported across the 10 experimental trials. Results are presented as aggregate counts (e.g., '9 of 10') without any measure of variability." 91 } 92 }, 93 "evaluation_design": { 94 "baselines_included": { 95 "applies": true, 96 "answer": false, 97 "justification": "No baselines are included. The paper only tests GPT-4 and does not compare against other LLMs, prior bias measurement tools, random baselines, or any reference point for contextualizing the findings." 98 }, 99 "baselines_contemporary": { 100 "applies": true, 101 "answer": false, 102 "justification": "The paper could and should have included contemporary baselines (other LLMs, prior bias detection methods). The absence of any baselines means this criterion applies but is not satisfied. The scan-agent instructions specify that applies=false is for structurally inapplicable criteria, not for when a paper fails to include something it should have." 103 }, 104 "ablation_study": { 105 "applies": true, 106 "answer": false, 107 "justification": "No ablation study is conducted. The paper tests five different experimental tasks but does not systematically isolate variables (e.g., prompt phrasing, name selection, word list composition) to determine which factors drive the observed biases." 108 }, 109 "multiple_metrics": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper uses multiple assessment approaches: LLM IAT (word association counts), Decision Bias (profile-based task allocation), Sycophancy (behavioral observation), Word Generation (word cloud analysis), and Story Generation (thematic analysis). These constitute multiple complementary evaluation methods." 113 }, 114 "human_evaluation": { 115 "applies": true, 116 "answer": false, 117 "justification": "The authors informally analyzed GPT-4 outputs through thematic observation, but this does not constitute a proper human evaluation study. There is no systematic coding scheme, no inter-rater reliability, no evaluation rubric, and no independent evaluators. The schema requires 'human ratings, manual inspection, user studies, expert review' — the paper's informal qualitative observations lack the methodological rigor to qualify." 118 }, 119 "held_out_test_set": { 120 "applies": false, 121 "answer": false, 122 "justification": "Not applicable — this is not a machine learning training study with train/dev/test splits. The paper uses the ChatGPT interface with manually designed prompts; there is no data split relevant here." 123 }, 124 "per_category_breakdown": { 125 "applies": true, 126 "answer": true, 127 "justification": "Results are reported separately for each of the five experimental tasks (LLM IAT, Decision Bias, Sycophancy, Word Generation, Story Generation) and further broken down by attribute type (gender-career, race-valence, Muslim-others IAT)." 128 }, 129 "failure_cases_discussed": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper discusses cases where the approach did not work as expected: GPT-4 rejected the visual IAT '90% of the time' when sensitive characteristics were mentioned, and the race-only Decision Bias results 'did not show much difference.' These qualify as discussion of failure cases." 133 }, 134 "negative_results_reported": { 135 "applies": true, 136 "answer": true, 137 "justification": "The paper reports that race-only prompts in the Decision Bias task 'did not show much difference' and that the model assigned gender to gender-neutral names rather than showing racial bias. The visual IAT failure is also a negative result. These count as reporting things that did not work as hypothesized." 138 } 139 }, 140 "claims_and_evidence": { 141 "abstract_claims_supported": { 142 "applies": true, 143 "answer": false, 144 "justification": "The abstract claims the LLM IAT Bias 'correlates with traditional methods and more effectively predicts downstream behaviors' but no quantitative correlation analysis is presented in the paper. The paper shows only qualitative consistency (both methods reveal bias) without statistical correlation. The claim of 'a more comprehensive framework' is not empirically validated against prior frameworks." 145 }, 146 "causal_claims_justified": { 147 "applies": true, 148 "answer": false, 149 "justification": "The paper makes causal claims including 'LLMs trained on biased data perpetuate biases,' 'RLHF causes sycophancy,' and 'biases become entrenched within AI systems.' The study design is purely observational — prompt-and-observe with no controlled manipulation of training data, RLHF parameters, or other causal factors. The causal claims are not supported by the study design." 150 }, 151 "generalization_bounded": { 152 "applies": true, 153 "answer": false, 154 "justification": "The paper tests only GPT-4 via the ChatGPT interface but makes broad claims about 'LLMs' generally. The title, abstract, and discussion all frame findings as applicable to 'Large Language Models' and 'AI systems' without bounding claims to GPT-4 specifically. The paper does not state that results may not generalize to other models." 155 }, 156 "alternative_explanations_discussed": { 157 "applies": true, 158 "answer": false, 159 "justification": "No substantive alternative explanations are discussed. The paper does not consider confounds such as prompt phrasing effects, temperature/sampling settings, whether results reflect RLHF guardrails vs. base model behavior, or whether the specific name choices (Rebecca/John) drive results. The brief note that race was subordinated to gender is an observation, not an analysis of alternative explanations." 160 } 161 }, 162 "setup_transparency": { 163 "model_versions_specified": { 164 "applies": true, 165 "answer": false, 166 "justification": "The paper states only 'GPT-4 via the interface of ChatGPT' without specifying a version number, API snapshot date, or model identifier (e.g., 'gpt-4-0613'). Per the schema, 'GPT-4' without a version does not meet the criterion." 167 }, 168 "prompts_provided": { 169 "applies": true, 170 "answer": true, 171 "justification": "Actual prompt text is provided for the key experiments: the LLM IAT prompt with the word list and name assignment instruction, the Decision Bias profile generation prompt, the Word Generation prompt ('Please brainstorm 10 words for the name Rebecca'), and the Story Generation prompts are quoted directly in the methodology section." 172 }, 173 "hyperparameters_reported": { 174 "applies": true, 175 "answer": false, 176 "justification": "No hyperparameters (temperature, top-p, max tokens, etc.) are reported. The paper uses the ChatGPT web interface, and the default settings used are not documented. These settings significantly affect output variability." 177 }, 178 "scaffolding_described": { 179 "applies": false, 180 "answer": false, 181 "justification": "No agentic scaffolding is used. The paper accesses GPT-4 through the ChatGPT interface with direct single-turn prompts (and occasional follow-ups); there is no agentic loop, tool use, or multi-step scaffolding system." 182 }, 183 "data_preprocessing_documented": { 184 "applies": true, 185 "answer": false, 186 "justification": "No data preprocessing documentation is provided. The paper goes from collecting LLM outputs to reporting results without describing how qualitative themes were identified, whether multiple coders were used, how word clouds were generated from outputs, or what coding scheme was applied to the thematic analysis." 187 } 188 }, 189 "limitations_and_scope": { 190 "limitations_section_present": { 191 "applies": true, 192 "answer": false, 193 "justification": "There is no dedicated limitations or threats-to-validity section. The paper has a brief Ethics section that only addresses privacy and data handling, not methodological limitations. The Conclusion mentions 'challenges' but does not substantively discuss limitations of the study itself." 194 }, 195 "threats_to_validity_specific": { 196 "applies": true, 197 "answer": false, 198 "justification": "No specific threats to validity are discussed. The paper does not address the small sample size (n=10), lack of inter-rater reliability for qualitative coding, potential prompt phrasing effects, model version instability, or generalizability to other LLMs." 199 }, 200 "scope_boundaries_stated": { 201 "applies": true, 202 "answer": false, 203 "justification": "No scope boundaries are explicitly stated. The paper does not specify that results apply only to GPT-4, only to the tested prompts, only to the specific names chosen, or only to the English language. Claims generalize broadly to 'LLMs' and 'AI systems' without stating what the paper does NOT claim." 204 } 205 }, 206 "data_integrity": { 207 "raw_data_available": { 208 "applies": true, 209 "answer": false, 210 "justification": "Raw data (actual LLM response transcripts) is not released. The Ethics section mentions data is stored on a secure database but no public access is provided for independent verification." 211 }, 212 "data_collection_described": { 213 "applies": true, 214 "answer": true, 215 "justification": "The data collection procedure is described: experiments were conducted via GPT-4 through the ChatGPT interface, each task was repeated 10 times with specific prompts provided verbatim, and the order of stimuli was randomized. The methodology section describes what data was collected for each of the five experiment types." 216 }, 217 "recruitment_methods_described": { 218 "applies": false, 219 "answer": false, 220 "justification": "Not applicable — there are no human participants in this study. The research collects and analyzes LLM outputs rather than recruiting human subjects." 221 }, 222 "data_pipeline_documented": { 223 "applies": true, 224 "answer": false, 225 "justification": "The pipeline from raw LLM outputs to reported findings is not documented. There is no description of how word clouds were constructed from raw outputs, how thematic categories were derived, whether multiple coders were involved, or what criteria were used to identify themes in the story analysis." 226 } 227 }, 228 "conflicts_of_interest": { 229 "funding_disclosed": { 230 "applies": false, 231 "answer": false, 232 "justification": "The paper appears to be unfunded student work at UCL (author email format 'serene.lim.21@ucl.ac.uk' suggests student enrollment). No acknowledgments section, no grant numbers, and no funding sources are mentioned. Per the schema, NA applies for 'clearly unfunded work.'" 233 }, 234 "affiliations_disclosed": { 235 "applies": true, 236 "answer": true, 237 "justification": "Author affiliations are clearly stated as University College London, London, United Kingdom in the paper header. Neither author appears to be affiliated with OpenAI, whose product is being evaluated." 238 }, 239 "funder_independent_of_outcome": { 240 "applies": false, 241 "answer": false, 242 "justification": "Not applicable — no funding is disclosed, and the paper appears to be unfunded student work. The schema says NA if unfunded." 243 }, 244 "financial_interests_declared": { 245 "applies": true, 246 "answer": false, 247 "justification": "No competing interests statement appears anywhere in the paper. There is no disclosure of patents, equity holdings, or other financial interests related to the findings. The schema states: 'If there is no competing interests statement at all, NO.'" 248 } 249 }, 250 "contamination": { 251 "training_cutoff_stated": { 252 "applies": false, 253 "answer": false, 254 "justification": "Not applicable — this paper does not evaluate GPT-4's capability on any benchmark. The study uses novel author-designed prompts to probe implicit biases, not pre-existing benchmark tasks. The schema says NA 'if the paper does not evaluate a pre-trained model's capability on any benchmark.'" 255 }, 256 "train_test_overlap_discussed": { 257 "applies": false, 258 "answer": false, 259 "justification": "Not applicable — the paper does not evaluate a pre-trained model on any benchmark. The prompts are original author-designed stimuli adapted from psychological literature, not a fixed ML benchmark corpus." 260 }, 261 "benchmark_contamination_addressed": { 262 "applies": false, 263 "answer": false, 264 "justification": "Not applicable — the study does not use a pre-existing benchmark dataset. The IAT-inspired prompts and decision bias scenarios are novel author-constructed stimuli, not taken from a published ML benchmark." 265 } 266 }, 267 "human_studies": { 268 "pre_registered": { 269 "applies": false, 270 "answer": false, 271 "justification": "Not applicable — no human participants are involved. The study evaluates LLM outputs, not human subjects." 272 }, 273 "irb_or_ethics_approval": { 274 "applies": false, 275 "answer": false, 276 "justification": "Not applicable — no human participants are involved. The Ethics section confirms: 'This form of prompt engineering will not have any implications on any other participants, nor will any privacy guidelines be breached.'" 277 }, 278 "demographics_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "Not applicable — no human participants are involved in the study." 282 }, 283 "inclusion_exclusion_criteria": { 284 "applies": false, 285 "answer": false, 286 "justification": "Not applicable — no human participants are involved." 287 }, 288 "randomization_described": { 289 "applies": false, 290 "answer": false, 291 "justification": "Not applicable — no human participants or experimental groups. The paper mentions 'randomizing the order of stimuli' but this is in LLM prompting, not human participant randomization." 292 }, 293 "blinding_described": { 294 "applies": false, 295 "answer": false, 296 "justification": "Not applicable — no human participants are involved in the study." 297 }, 298 "attrition_reported": { 299 "applies": false, 300 "answer": false, 301 "justification": "Not applicable — no human participants are involved, so attrition is not relevant." 302 } 303 }, 304 "cost_and_practicality": { 305 "inference_cost_reported": { 306 "applies": true, 307 "answer": false, 308 "justification": "No inference cost, API cost, or token consumption is reported. The paper uses the ChatGPT interface for GPT-4 queries but provides no estimate of costs incurred or tokens consumed." 309 }, 310 "compute_budget_stated": { 311 "applies": true, 312 "answer": false, 313 "justification": "No compute budget is stated. The total number of API interactions, wall-clock time, or cost estimate is not provided." 314 } 315 } 316 } 317 }