calibration-summary.json (12785B)
1 { 2 "round": 3, 3 "date": "2026-02-28", 4 "checklist_version": "two-field-boolean-v1", 5 "papers_audited": 60, 6 "total_questions": 3000, 7 "total_agreements": 2911, 8 "total_disagreements": 139, 9 "overall_agreement_rate": 0.9703, 10 "mean_per_paper_agreement": 0.9538, 11 "direction_breakdown": { 12 "applies_boundary": 72, 13 "sonnet_generous": 50, 14 "opus_generous": 17 15 }, 16 "most_disagreed_questions": [ 17 { 18 "question": "human_evaluation", 19 "count": 14, 20 "directions": { 21 "applies_boundary": 11, 22 "sonnet_generous": 3 23 } 24 }, 25 { 26 "question": "effect_sizes_reported", 27 "count": 12, 28 "directions": { 29 "opus_generous": 9, 30 "sonnet_generous": 2, 31 "applies_boundary": 1 32 } 33 }, 34 { 35 "question": "funder_independent_of_outcome", 36 "count": 9, 37 "directions": { 38 "applies_boundary": 9 39 } 40 }, 41 { 42 "question": "prompts_provided", 43 "count": 6, 44 "directions": { 45 "sonnet_generous": 5, 46 "opus_generous": 1 47 } 48 }, 49 { 50 "question": "model_versions_specified", 51 "count": 5, 52 "directions": { 53 "sonnet_generous": 5 54 } 55 }, 56 { 57 "question": "failure_cases_discussed", 58 "count": 5, 59 "directions": { 60 "opus_generous": 1, 61 "sonnet_generous": 3, 62 "applies_boundary": 1 63 } 64 }, 65 { 66 "question": "training_cutoff_stated", 67 "count": 5, 68 "directions": { 69 "applies_boundary": 5 70 } 71 }, 72 { 73 "question": "train_test_overlap_discussed", 74 "count": 4, 75 "directions": { 76 "applies_boundary": 4 77 } 78 }, 79 { 80 "question": "generalization_bounded", 81 "count": 4, 82 "directions": { 83 "sonnet_generous": 4 84 } 85 }, 86 { 87 "question": "environment_specified", 88 "count": 4, 89 "directions": { 90 "sonnet_generous": 3, 91 "applies_boundary": 1 92 } 93 }, 94 { 95 "question": "multiple_metrics", 96 "count": 4, 97 "directions": { 98 "sonnet_generous": 2, 99 "applies_boundary": 2 100 } 101 }, 102 { 103 "question": "benchmark_contamination_addressed", 104 "count": 3, 105 "directions": { 106 "applies_boundary": 3 107 } 108 }, 109 { 110 "question": "alternative_explanations_discussed", 111 "count": 3, 112 "directions": { 113 "sonnet_generous": 3 114 } 115 }, 116 { 117 "question": "ablation_study", 118 "count": 3, 119 "directions": { 120 "sonnet_generous": 2, 121 "applies_boundary": 1 122 } 123 }, 124 { 125 "question": "baselines_contemporary", 126 "count": 3, 127 "directions": { 128 "applies_boundary": 3 129 } 130 }, 131 { 132 "question": "pre_registered", 133 "count": 3, 134 "directions": { 135 "applies_boundary": 3 136 } 137 }, 138 { 139 "question": "irb_or_ethics_approval", 140 "count": 3, 141 "directions": { 142 "applies_boundary": 3 143 } 144 }, 145 { 146 "question": "held_out_test_set", 147 "count": 3, 148 "directions": { 149 "sonnet_generous": 2, 150 "applies_boundary": 1 151 } 152 }, 153 { 154 "question": "hyperparameters_reported", 155 "count": 2, 156 "directions": { 157 "applies_boundary": 1, 158 "sonnet_generous": 1 159 } 160 }, 161 { 162 "question": "compute_budget_stated", 163 "count": 2, 164 "directions": { 165 "opus_generous": 1, 166 "sonnet_generous": 1 167 } 168 }, 169 { 170 "question": "code_released", 171 "count": 2, 172 "directions": { 173 "sonnet_generous": 2 174 } 175 }, 176 { 177 "question": "raw_data_available", 178 "count": 2, 179 "directions": { 180 "sonnet_generous": 1, 181 "applies_boundary": 1 182 } 183 }, 184 { 185 "question": "variance_reported", 186 "count": 2, 187 "directions": { 188 "sonnet_generous": 2 189 } 190 }, 191 { 192 "question": "causal_claims_justified", 193 "count": 2, 194 "directions": { 195 "opus_generous": 1, 196 "sonnet_generous": 1 197 } 198 }, 199 { 200 "question": "funding_disclosed", 201 "count": 2, 202 "directions": { 203 "opus_generous": 1, 204 "applies_boundary": 1 205 } 206 }, 207 { 208 "question": "data_released", 209 "count": 2, 210 "directions": { 211 "sonnet_generous": 1, 212 "applies_boundary": 1 213 } 214 }, 215 { 216 "question": "inference_cost_reported", 217 "count": 2, 218 "directions": { 219 "opus_generous": 1, 220 "sonnet_generous": 1 221 } 222 }, 223 { 224 "question": "demographics_reported", 225 "count": 2, 226 "directions": { 227 "applies_boundary": 2 228 } 229 }, 230 { 231 "question": "inclusion_exclusion_criteria", 232 "count": 2, 233 "directions": { 234 "applies_boundary": 2 235 } 236 }, 237 { 238 "question": "blinding_described", 239 "count": 2, 240 "directions": { 241 "applies_boundary": 2 242 } 243 }, 244 { 245 "question": "significance_tests", 246 "count": 2, 247 "directions": { 248 "applies_boundary": 2 249 } 250 }, 251 { 252 "question": "per_category_breakdown", 253 "count": 2, 254 "directions": { 255 "applies_boundary": 1, 256 "sonnet_generous": 1 257 } 258 } 259 ], 260 "per_paper": [ 261 { 262 "slug": "aegis20-diverse-ai-2025", 263 "agreement_rate": 0.84, 264 "disagreements": 8 265 }, 266 { 267 "slug": "agentbased-evaluation-framework-2025", 268 "agreement_rate": 0.84, 269 "disagreements": 8 270 }, 271 { 272 "slug": "aegis-automated-coevolutionary-2025", 273 "agreement_rate": 0.88, 274 "disagreements": 6 275 }, 276 { 277 "slug": "agentmesh-cooperative-multiagent-2025", 278 "agreement_rate": 0.88, 279 "disagreements": 6 280 }, 281 { 282 "slug": "accelerating-automatic-program-2025", 283 "agreement_rate": 0.9, 284 "disagreements": 5 285 }, 286 { 287 "slug": "adaptevolve-improving-efficiency-2026", 288 "agreement_rate": 0.9, 289 "disagreements": 5 290 }, 291 { 292 "slug": "agentasajudge-evaluate-agents-2024", 293 "agreement_rate": 0.9, 294 "disagreements": 5 295 }, 296 { 297 "slug": "agentdojo-dynamic-environment-2024", 298 "agreement_rate": 0.9, 299 "disagreements": 5 300 }, 301 { 302 "slug": "3dshape2vecset-3d-shape-2023", 303 "agreement_rate": 0.92, 304 "disagreements": 4 305 }, 306 { 307 "slug": "adapting-knowledge-prompt-2025", 308 "agreement_rate": 0.92, 309 "disagreements": 4 310 }, 311 { 312 "slug": "adaptive-visionbased-coverage-2025", 313 "agreement_rate": 0.92, 314 "disagreements": 4 315 }, 316 { 317 "slug": "adaptrack-constrained-decoding-2025", 318 "agreement_rate": 0.92, 319 "disagreements": 4 320 }, 321 { 322 "slug": "aegisagent-autonomous-defense-2025", 323 "agreement_rate": 0.92, 324 "disagreements": 4 325 }, 326 { 327 "slug": "african-woman-rhythmic-2024", 328 "agreement_rate": 0.92, 329 "disagreements": 4 330 }, 331 { 332 "slug": "agentbench-evaluating-llms-2023", 333 "agreement_rate": 0.92, 334 "disagreements": 4 335 }, 336 { 337 "slug": "agents-of-chaos-2026", 338 "agreement_rate": 0.92, 339 "disagreements": 4 340 }, 341 { 342 "slug": "aart-aiassisted-redteaming-2023", 343 "agreement_rate": 0.94, 344 "disagreements": 3 345 }, 346 { 347 "slug": "adaptive-attacks-break-2025", 348 "agreement_rate": 0.94, 349 "disagreements": 3 350 }, 351 { 352 "slug": "advancing-llm-safe-2025", 353 "agreement_rate": 0.94, 354 "disagreements": 3 355 }, 356 { 357 "slug": "agent-contracts-formal-2026", 358 "agreement_rate": 0.94, 359 "disagreements": 3 360 }, 361 { 362 "slug": "agentic-ai-assessment-framework-2025", 363 "agreement_rate": 0.94, 364 "disagreements": 3 365 }, 366 { 367 "slug": "agentsnet-coordination-collaborative-2025", 368 "agreement_rate": 0.94, 369 "disagreements": 3 370 }, 371 { 372 "slug": "agentspawn-adaptive-multiagent-2026", 373 "agreement_rate": 0.94, 374 "disagreements": 3 375 }, 376 { 377 "slug": "2025-ai-agent-2026", 378 "agreement_rate": 0.96, 379 "disagreements": 2 380 }, 381 { 382 "slug": "acar-adaptive-complexity-2026", 383 "agreement_rate": 0.96, 384 "disagreements": 2 385 }, 386 { 387 "slug": "across-programming-language-2025", 388 "agreement_rate": 0.96, 389 "disagreements": 2 390 }, 391 { 392 "slug": "adafuse-adaptive-ensemble-2026", 393 "agreement_rate": 0.96, 394 "disagreements": 2 395 }, 396 { 397 "slug": "adaptive-data-augmentation-2026", 398 "agreement_rate": 0.96, 399 "disagreements": 2 400 }, 401 { 402 "slug": "advancing-language-model-2025", 403 "agreement_rate": 0.96, 404 "disagreements": 2 405 }, 406 { 407 "slug": "advancing-software-quality-2025", 408 "agreement_rate": 0.96, 409 "disagreements": 2 410 }, 411 { 412 "slug": "agentic-ai-software-2025", 413 "agreement_rate": 0.96, 414 "disagreements": 2 415 }, 416 { 417 "slug": "agentless-2024", 418 "agreement_rate": 0.96, 419 "disagreements": 2 420 }, 421 { 422 "slug": "agenttypo-adaptive-typographic-2025", 423 "agreement_rate": 0.96, 424 "disagreements": 2 425 }, 426 { 427 "slug": "accelerating-large-language-2023", 428 "agreement_rate": 0.98, 429 "disagreements": 1 430 }, 431 { 432 "slug": "adaplanner-adaptive-planning-2023", 433 "agreement_rate": 0.98, 434 "disagreements": 1 435 }, 436 { 437 "slug": "adaptive-attacks-bypass-defenses-2025", 438 "agreement_rate": 0.98, 439 "disagreements": 1 440 }, 441 { 442 "slug": "adaptive-test-generation-2023", 443 "agreement_rate": 0.98, 444 "disagreements": 1 445 }, 446 { 447 "slug": "adoption-generative-artificial-2026", 448 "agreement_rate": 0.98, 449 "disagreements": 1 450 }, 451 { 452 "slug": "advancements-generative-ai-2023", 453 "agreement_rate": 0.98, 454 "disagreements": 1 455 }, 456 { 457 "slug": "advancing-code-generation-2025", 458 "agreement_rate": 0.98, 459 "disagreements": 1 460 }, 461 { 462 "slug": "adversarial-threat-vectors-2025", 463 "agreement_rate": 0.98, 464 "disagreements": 1 465 }, 466 { 467 "slug": "agent-error-taxonomy-2025", 468 "agreement_rate": 0.98, 469 "disagreements": 1 470 }, 471 { 472 "slug": "agentask-multiagent-systems-2025", 473 "agreement_rate": 0.98, 474 "disagreements": 1 475 }, 476 { 477 "slug": "agentic-adoption-github-2026", 478 "agreement_rate": 0.98, 479 "disagreements": 1 480 }, 481 { 482 "slug": "agentic-ai-security-survey-2025", 483 "agreement_rate": 0.98, 484 "disagreements": 1 485 }, 486 { 487 "slug": "agentic-ai-software-2025-2", 488 "agreement_rate": 0.98, 489 "disagreements": 1 490 }, 491 { 492 "slug": "agentic-programming-survey-2025", 493 "agreement_rate": 0.98, 494 "disagreements": 1 495 }, 496 { 497 "slug": "agentic-refactoring-empirical-2025", 498 "agreement_rate": 0.98, 499 "disagreements": 1 500 }, 501 { 502 "slug": "agentsllm-augmentative-generation-2025", 503 "agreement_rate": 0.98, 504 "disagreements": 1 505 }, 506 { 507 "slug": "agentvigil-generic-blackbox-2025", 508 "agreement_rate": 0.98, 509 "disagreements": 1 510 }, 511 { 512 "slug": "agent-security-bench-2024", 513 "agreement_rate": 0.99, 514 "disagreements": 1 515 }, 516 { 517 "slug": "adversarial-bug-reports-2025", 518 "agreement_rate": 1.0, 519 "disagreements": 0 520 }, 521 { 522 "slug": "advevomarl-shaping-internalized-2025", 523 "agreement_rate": 1.0, 524 "disagreements": 0 525 }, 526 { 527 "slug": "agent-developer-practices-2025", 528 "agreement_rate": 1.0, 529 "disagreements": 0 530 }, 531 { 532 "slug": "agentfm-roleaware-failure-2025", 533 "agreement_rate": 1.0, 534 "disagreements": 0 535 }, 536 { 537 "slug": "agentic-ai-architectures-2026", 538 "agreement_rate": 1.0, 539 "disagreements": 0 540 }, 541 { 542 "slug": "agentic-bug-reproduction-2025", 543 "agreement_rate": 1.0, 544 "disagreements": 0 545 }, 546 { 547 "slug": "agentic-memory-learning-2026", 548 "agreement_rate": 1.0, 549 "disagreements": 0 550 }, 551 { 552 "slug": "agentic-software-engineering-2025", 553 "agreement_rate": 1.0, 554 "disagreements": 0 555 }, 556 { 557 "slug": "agents4plc-automating-closedloop-2024", 558 "agreement_rate": 1.0, 559 "disagreements": 0 560 } 561 ], 562 "perfect_agreement_count": 9, 563 "below_target_count": 23, 564 "comparison_with_previous_rounds": { 565 "round_1": { 566 "papers": 8, 567 "agreement_rate": 0.932, 568 "note": "Pre two-field design. 56% NA boundary, 44% generosity" 569 }, 570 "round_2": { 571 "papers": 10, 572 "agreement_rate": 0.962, 573 "note": "Pre two-field design. 47% NA boundary. Led to two-field redesign" 574 }, 575 "round_3": { 576 "papers": 60, 577 "agreement_rate": 0.9703, 578 "note": "Two-field boolean design. First full-scale calibration." 579 } 580 } 581 }