early-findings.json (25947B)
1 { 2 "n_papers": 561, 3 "per_question": { 4 "artifacts.code_released": { 5 "applies_count": 547, 6 "applies_rate": 0.975, 7 "yes_count": 295, 8 "compliance_rate": 0.5393 9 }, 10 "artifacts.data_released": { 11 "applies_count": 544, 12 "applies_rate": 0.9697, 13 "yes_count": 347, 14 "compliance_rate": 0.6379 15 }, 16 "artifacts.environment_specified": { 17 "applies_count": 511, 18 "applies_rate": 0.9109, 19 "yes_count": 48, 20 "compliance_rate": 0.0939 21 }, 22 "artifacts.reproduction_instructions": { 23 "applies_count": 534, 24 "applies_rate": 0.9519, 25 "yes_count": 30, 26 "compliance_rate": 0.0562 27 }, 28 "statistical_methodology.confidence_intervals_or_error_bars": { 29 "applies_count": 485, 30 "applies_rate": 0.8645, 31 "yes_count": 102, 32 "compliance_rate": 0.2103 33 }, 34 "statistical_methodology.significance_tests": { 35 "applies_count": 474, 36 "applies_rate": 0.8449, 37 "yes_count": 92, 38 "compliance_rate": 0.1941 39 }, 40 "statistical_methodology.effect_sizes_reported": { 41 "applies_count": 478, 42 "applies_rate": 0.852, 43 "yes_count": 342, 44 "compliance_rate": 0.7155 45 }, 46 "statistical_methodology.sample_size_justified": { 47 "applies_count": 488, 48 "applies_rate": 0.8699, 49 "yes_count": 22, 50 "compliance_rate": 0.0451 51 }, 52 "statistical_methodology.variance_reported": { 53 "applies_count": 482, 54 "applies_rate": 0.8592, 55 "yes_count": 104, 56 "compliance_rate": 0.2158 57 }, 58 "evaluation_design.baselines_included": { 59 "applies_count": 521, 60 "applies_rate": 0.9287, 61 "yes_count": 458, 62 "compliance_rate": 0.8791 63 }, 64 "evaluation_design.baselines_contemporary": { 65 "applies_count": 507, 66 "applies_rate": 0.9037, 67 "yes_count": 399, 68 "compliance_rate": 0.787 69 }, 70 "evaluation_design.ablation_study": { 71 "applies_count": 452, 72 "applies_rate": 0.8057, 73 "yes_count": 335, 74 "compliance_rate": 0.7412 75 }, 76 "evaluation_design.multiple_metrics": { 77 "applies_count": 491, 78 "applies_rate": 0.8752, 79 "yes_count": 415, 80 "compliance_rate": 0.8452 81 }, 82 "evaluation_design.human_evaluation": { 83 "applies_count": 367, 84 "applies_rate": 0.6542, 85 "yes_count": 151, 86 "compliance_rate": 0.4114 87 }, 88 "evaluation_design.held_out_test_set": { 89 "applies_count": 361, 90 "applies_rate": 0.6435, 91 "yes_count": 266, 92 "compliance_rate": 0.7368 93 }, 94 "evaluation_design.per_category_breakdown": { 95 "applies_count": 539, 96 "applies_rate": 0.9608, 97 "yes_count": 481, 98 "compliance_rate": 0.8924 99 }, 100 "evaluation_design.failure_cases_discussed": { 101 "applies_count": 555, 102 "applies_rate": 0.9893, 103 "yes_count": 460, 104 "compliance_rate": 0.8288 105 }, 106 "evaluation_design.negative_results_reported": { 107 "applies_count": 545, 108 "applies_rate": 0.9715, 109 "yes_count": 482, 110 "compliance_rate": 0.8844 111 }, 112 "claims_and_evidence.abstract_claims_supported": { 113 "applies_count": 561, 114 "applies_rate": 1.0, 115 "yes_count": 513, 116 "compliance_rate": 0.9144 117 }, 118 "claims_and_evidence.causal_claims_justified": { 119 "applies_count": 505, 120 "applies_rate": 0.9002, 121 "yes_count": 324, 122 "compliance_rate": 0.6416 123 }, 124 "claims_and_evidence.generalization_bounded": { 125 "applies_count": 561, 126 "applies_rate": 1.0, 127 "yes_count": 200, 128 "compliance_rate": 0.3565 129 }, 130 "claims_and_evidence.alternative_explanations_discussed": { 131 "applies_count": 534, 132 "applies_rate": 0.9519, 133 "yes_count": 189, 134 "compliance_rate": 0.3539 135 }, 136 "setup_transparency.model_versions_specified": { 137 "applies_count": 473, 138 "applies_rate": 0.8431, 139 "yes_count": 182, 140 "compliance_rate": 0.3848 141 }, 142 "setup_transparency.prompts_provided": { 143 "applies_count": 415, 144 "applies_rate": 0.7398, 145 "yes_count": 268, 146 "compliance_rate": 0.6458 147 }, 148 "setup_transparency.hyperparameters_reported": { 149 "applies_count": 483, 150 "applies_rate": 0.861, 151 "yes_count": 278, 152 "compliance_rate": 0.5756 153 }, 154 "setup_transparency.scaffolding_described": { 155 "applies_count": 238, 156 "applies_rate": 0.4242, 157 "yes_count": 224, 158 "compliance_rate": 0.9412 159 }, 160 "setup_transparency.data_preprocessing_documented": { 161 "applies_count": 529, 162 "applies_rate": 0.943, 163 "yes_count": 403, 164 "compliance_rate": 0.7618 165 }, 166 "limitations_and_scope.limitations_section_present": { 167 "applies_count": 561, 168 "applies_rate": 1.0, 169 "yes_count": 358, 170 "compliance_rate": 0.6381 171 }, 172 "limitations_and_scope.threats_to_validity_specific": { 173 "applies_count": 561, 174 "applies_rate": 1.0, 175 "yes_count": 301, 176 "compliance_rate": 0.5365 177 }, 178 "limitations_and_scope.scope_boundaries_stated": { 179 "applies_count": 561, 180 "applies_rate": 1.0, 181 "yes_count": 219, 182 "compliance_rate": 0.3904 183 }, 184 "data_integrity.raw_data_available": { 185 "applies_count": 535, 186 "applies_rate": 0.9537, 187 "yes_count": 216, 188 "compliance_rate": 0.4037 189 }, 190 "data_integrity.data_collection_described": { 191 "applies_count": 541, 192 "applies_rate": 0.9643, 193 "yes_count": 469, 194 "compliance_rate": 0.8669 195 }, 196 "data_integrity.recruitment_methods_described": { 197 "applies_count": 112, 198 "applies_rate": 0.1996, 199 "yes_count": 50, 200 "compliance_rate": 0.4464 201 }, 202 "data_integrity.data_pipeline_documented": { 203 "applies_count": 536, 204 "applies_rate": 0.9554, 205 "yes_count": 400, 206 "compliance_rate": 0.7463 207 }, 208 "conflicts_of_interest.funding_disclosed": { 209 "applies_count": 560, 210 "applies_rate": 0.9982, 211 "yes_count": 183, 212 "compliance_rate": 0.3268 213 }, 214 "conflicts_of_interest.affiliations_disclosed": { 215 "applies_count": 561, 216 "applies_rate": 1.0, 217 "yes_count": 555, 218 "compliance_rate": 0.9893 219 }, 220 "conflicts_of_interest.funder_independent_of_outcome": { 221 "applies_count": 509, 222 "applies_rate": 0.9073, 223 "yes_count": 157, 224 "compliance_rate": 0.3084 225 }, 226 "conflicts_of_interest.financial_interests_declared": { 227 "applies_count": 561, 228 "applies_rate": 1.0, 229 "yes_count": 23, 230 "compliance_rate": 0.041 231 }, 232 "contamination.training_cutoff_stated": { 233 "applies_count": 327, 234 "applies_rate": 0.5829, 235 "yes_count": 23, 236 "compliance_rate": 0.0703 237 }, 238 "contamination.train_test_overlap_discussed": { 239 "applies_count": 328, 240 "applies_rate": 0.5847, 241 "yes_count": 95, 242 "compliance_rate": 0.2896 243 }, 244 "contamination.benchmark_contamination_addressed": { 245 "applies_count": 323, 246 "applies_rate": 0.5758, 247 "yes_count": 82, 248 "compliance_rate": 0.2539 249 }, 250 "human_studies.pre_registered": { 251 "applies_count": 79, 252 "applies_rate": 0.1408, 253 "yes_count": 1, 254 "compliance_rate": 0.0127 255 }, 256 "human_studies.irb_or_ethics_approval": { 257 "applies_count": 79, 258 "applies_rate": 0.1408, 259 "yes_count": 10, 260 "compliance_rate": 0.1266 261 }, 262 "human_studies.demographics_reported": { 263 "applies_count": 80, 264 "applies_rate": 0.1426, 265 "yes_count": 34, 266 "compliance_rate": 0.425 267 }, 268 "human_studies.inclusion_exclusion_criteria": { 269 "applies_count": 80, 270 "applies_rate": 0.1426, 271 "yes_count": 23, 272 "compliance_rate": 0.2875 273 }, 274 "human_studies.randomization_described": { 275 "applies_count": 29, 276 "applies_rate": 0.0517, 277 "yes_count": 14, 278 "compliance_rate": 0.4828 279 }, 280 "human_studies.blinding_described": { 281 "applies_count": 47, 282 "applies_rate": 0.0838, 283 "yes_count": 22, 284 "compliance_rate": 0.4681 285 }, 286 "human_studies.attrition_reported": { 287 "applies_count": 77, 288 "applies_rate": 0.1373, 289 "yes_count": 15, 290 "compliance_rate": 0.1948 291 }, 292 "cost_and_practicality.inference_cost_reported": { 293 "applies_count": 466, 294 "applies_rate": 0.8307, 295 "yes_count": 156, 296 "compliance_rate": 0.3348 297 }, 298 "cost_and_practicality.compute_budget_stated": { 299 "applies_count": 470, 300 "applies_rate": 0.8378, 301 "yes_count": 96, 302 "compliance_rate": 0.2043 303 } 304 }, 305 "per_category": { 306 "artifacts": { 307 "mean": 0.3297, 308 "median": 0.25, 309 "std": 0.2654, 310 "n_papers_with_applicable": 548 311 }, 312 "statistical_methodology": { 313 "mean": 0.2727, 314 "median": 0.2, 315 "std": 0.2416, 316 "n_papers_with_applicable": 490 317 }, 318 "evaluation_design": { 319 "mean": 0.7924, 320 "median": 0.875, 321 "std": 0.2411, 322 "n_papers_with_applicable": 556 323 }, 324 "claims_and_evidence": { 325 "mean": 0.5735, 326 "median": 0.5, 327 "std": 0.2986, 328 "n_papers_with_applicable": 561 329 }, 330 "setup_transparency": { 331 "mean": 0.6194, 332 "median": 0.6, 333 "std": 0.3106, 334 "n_papers_with_applicable": 539 335 }, 336 "limitations_and_scope": { 337 "mean": 0.5217, 338 "median": 0.6667, 339 "std": 0.4164, 340 "n_papers_with_applicable": 561 341 }, 342 "data_integrity": { 343 "mean": 0.6548, 344 "median": 0.6667, 345 "std": 0.3195, 346 "n_papers_with_applicable": 542 347 }, 348 "conflicts_of_interest": { 349 "mean": 0.4186, 350 "median": 0.25, 351 "std": 0.238, 352 "n_papers_with_applicable": 561 353 }, 354 "contamination": { 355 "mean": 0.2053, 356 "median": 0.0, 357 "std": 0.3347, 358 "n_papers_with_applicable": 328 359 }, 360 "human_studies": { 361 "mean": 0.2555, 362 "median": 0.2, 363 "std": 0.2302, 364 "n_papers_with_applicable": 80 365 }, 366 "cost_and_practicality": { 367 "mean": 0.2696, 368 "median": 0.0, 369 "std": 0.3697, 370 "n_papers_with_applicable": 471 371 } 372 }, 373 "paper_score_distribution": { 374 "mean": 0.5093, 375 "median": 0.5278, 376 "std": 0.174, 377 "min": 0.0238, 378 "max": 0.878, 379 "q25": 0.4146, 380 "q75": 0.625 381 }, 382 "lowest_10": [ 383 { 384 "slug": "automating-rest-api-2024", 385 "year": 2024, 386 "score": 0.0238, 387 "applicable": 42, 388 "satisfied": 1, 389 "title": "Automating REST API Postman Test Cases Using LLM", 390 "methodology_tags": [ 391 "case-study" 392 ] 393 }, 394 { 395 "slug": "chatofthought-collaborative-multiagent-2025", 396 "year": 2025, 397 "score": 0.0256, 398 "applicable": 39, 399 "satisfied": 1, 400 "title": "Chat-of-Thought: Collaborative Multi-Agent System for Generating Domain Specific Information", 401 "methodology_tags": [ 402 "case-study" 403 ] 404 }, 405 { 406 "slug": "aidriven-software-engineering-2023", 407 "year": 2023, 408 "score": 0.027, 409 "applicable": 37, 410 "satisfied": 1, 411 "title": "AI-driven software engineering", 412 "methodology_tags": [ 413 "qualitative", 414 "observational" 415 ] 416 }, 417 { 418 "slug": "aiassisted-code-editors-2025", 419 "year": 2025, 420 "score": 0.0435, 421 "applicable": 23, 422 "satisfied": 1, 423 "title": "AI-Assisted Code Editors with Real-Time Collaboration: A Comprehensive Review", 424 "methodology_tags": [ 425 "meta-analysis" 426 ] 427 }, 428 { 429 "slug": "ai-agents-software-2025", 430 "year": 2025, 431 "score": 0.0526, 432 "applicable": 38, 433 "satisfied": 2, 434 "title": "AI Agents in Software Engineering Optimizing Software Development Processes and Enhancing Security Management in Learning Management Systems", 435 "methodology_tags": [ 436 "theoretical" 437 ] 438 }, 439 { 440 "slug": "attacking-llms-ai-2025", 441 "year": 2025, 442 "score": 0.0526, 443 "applicable": 38, 444 "satisfied": 2, 445 "title": "Attacking LLMs and AI Agents: Advertisement Embedding Attacks Against Large Language Models", 446 "methodology_tags": [ 447 "case-study" 448 ] 449 }, 450 { 451 "slug": "breaking-prompt-wall-2025", 452 "year": 2025, 453 "score": 0.0556, 454 "applicable": 36, 455 "satisfied": 2, 456 "title": "Breaking the Prompt Wall (I): A Real-World Case Study of Attacking ChatGPT via Lightweight Prompt Injection", 457 "methodology_tags": [ 458 "case-study" 459 ] 460 }, 461 { 462 "slug": "agentic-ai-software-2025", 463 "year": 2025, 464 "score": 0.0667, 465 "applicable": 15, 466 "satisfied": 1, 467 "title": "Agentic AI for Software: thoughts from Software Engineering community", 468 "methodology_tags": [ 469 "theoretical", 470 "case-study" 471 ] 472 }, 473 { 474 "slug": "aipowered-code-review-2024", 475 "year": 2024, 476 "score": 0.0769, 477 "applicable": 39, 478 "satisfied": 3, 479 "title": "AI-powered Code Review with LLMs: Early Results", 480 "methodology_tags": [ 481 "case-study" 482 ] 483 }, 484 { 485 "slug": "aipowered-software-development-2025-2", 486 "year": 2025, 487 "score": 0.0769, 488 "applicable": 13, 489 "satisfied": 1, 490 "title": "AI-Powered Software Development Life Cycle: From Requirements to Maintenance", 491 "methodology_tags": [ 492 "theoretical" 493 ] 494 } 495 ], 496 "highest_10": [ 497 { 498 "slug": "concrete-roadmap-safety-2025", 499 "year": 2025, 500 "score": 0.8333, 501 "applicable": 12, 502 "satisfied": 10, 503 "title": "A Concrete Roadmap towards Safety Cases based on Chain-of-Thought Monitoring", 504 "methodology_tags": [ 505 "theoretical" 506 ] 507 }, 508 { 509 "slug": "ai-alignment-strategies-2025", 510 "year": 2025, 511 "score": 0.8462, 512 "applicable": 13, 513 "satisfied": 11, 514 "title": "AI Alignment Strategies from a Risk Perspective: Independent Safety Mechanisms or Shared Failures?", 515 "methodology_tags": [ 516 "theoretical" 517 ] 518 }, 519 { 520 "slug": "ai-testing-should-2025", 521 "year": 2025, 522 "score": 0.8462, 523 "applicable": 13, 524 "satisfied": 11, 525 "title": "AI Testing Should Account for Sophisticated Strategic Behaviour", 526 "methodology_tags": [ 527 "theoretical" 528 ] 529 }, 530 { 531 "slug": "alphacode-competition-level-2022", 532 "year": 2022, 533 "score": 0.8571, 534 "applicable": 42, 535 "satisfied": 36, 536 "title": "Competition-Level Code Generation with AlphaCode", 537 "methodology_tags": [ 538 "benchmark-eval" 539 ] 540 }, 541 { 542 "slug": "appworld-controllable-world-2024", 543 "year": 2024, 544 "score": 0.8571, 545 "applicable": 42, 546 "satisfied": 36, 547 "title": "AppWorld: A Controllable World of Apps and People for Benchmarking Interactive Coding Agents", 548 "methodology_tags": [ 549 "benchmark-eval" 550 ] 551 }, 552 { 553 "slug": "agentdojo-dynamic-environment-2024", 554 "year": 2024, 555 "score": 0.8611, 556 "applicable": 36, 557 "satisfied": 31, 558 "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents", 559 "methodology_tags": [ 560 "benchmark-eval" 561 ] 562 }, 563 { 564 "slug": "attention-pruning-automated-2025", 565 "year": 2025, 566 "score": 0.8684, 567 "applicable": 38, 568 "satisfied": 33, 569 "title": "Attention Pruning: Automated Fairness Repair of Language Models via Surrogate Simulated Annealing", 570 "methodology_tags": [ 571 "benchmark-eval" 572 ] 573 }, 574 { 575 "slug": "bridging-mde-ai-2023", 576 "year": 2023, 577 "score": 0.8696, 578 "applicable": 23, 579 "satisfied": 20, 580 "title": "Bridging MDE and AI: A Systematic Review of Domain-Specific Languages and Model-Driven Practices in AI Software Systems Engineering", 581 "methodology_tags": [ 582 "meta-analysis" 583 ] 584 }, 585 { 586 "slug": "annotation-alignment-comparing-2024", 587 "year": 2024, 588 "score": 0.8718, 589 "applicable": 39, 590 "satisfied": 34, 591 "title": "Annotation alignment: Comparing LLM and human annotations of conversational safety", 592 "methodology_tags": [ 593 "benchmark-eval", 594 "observational" 595 ] 596 }, 597 { 598 "slug": "assessing-latent-automated-2024", 599 "year": 2024, 600 "score": 0.878, 601 "applicable": 41, 602 "satisfied": 36, 603 "title": "Assessing the Latent Automated Program Repair Capabilities of Large Language Models using Round-Trip Translation", 604 "methodology_tags": [ 605 "benchmark-eval" 606 ] 607 } 608 ], 609 "lowest_compliance_questions": [ 610 [ 611 "human_studies.pre_registered", 612 { 613 "applies_count": 79, 614 "applies_rate": 0.1408, 615 "yes_count": 1, 616 "compliance_rate": 0.0127 617 } 618 ], 619 [ 620 "conflicts_of_interest.financial_interests_declared", 621 { 622 "applies_count": 561, 623 "applies_rate": 1.0, 624 "yes_count": 23, 625 "compliance_rate": 0.041 626 } 627 ], 628 [ 629 "statistical_methodology.sample_size_justified", 630 { 631 "applies_count": 488, 632 "applies_rate": 0.8699, 633 "yes_count": 22, 634 "compliance_rate": 0.0451 635 } 636 ], 637 [ 638 "artifacts.reproduction_instructions", 639 { 640 "applies_count": 534, 641 "applies_rate": 0.9519, 642 "yes_count": 30, 643 "compliance_rate": 0.0562 644 } 645 ], 646 [ 647 "contamination.training_cutoff_stated", 648 { 649 "applies_count": 327, 650 "applies_rate": 0.5829, 651 "yes_count": 23, 652 "compliance_rate": 0.0703 653 } 654 ], 655 [ 656 "artifacts.environment_specified", 657 { 658 "applies_count": 511, 659 "applies_rate": 0.9109, 660 "yes_count": 48, 661 "compliance_rate": 0.0939 662 } 663 ], 664 [ 665 "human_studies.irb_or_ethics_approval", 666 { 667 "applies_count": 79, 668 "applies_rate": 0.1408, 669 "yes_count": 10, 670 "compliance_rate": 0.1266 671 } 672 ], 673 [ 674 "statistical_methodology.significance_tests", 675 { 676 "applies_count": 474, 677 "applies_rate": 0.8449, 678 "yes_count": 92, 679 "compliance_rate": 0.1941 680 } 681 ], 682 [ 683 "human_studies.attrition_reported", 684 { 685 "applies_count": 77, 686 "applies_rate": 0.1373, 687 "yes_count": 15, 688 "compliance_rate": 0.1948 689 } 690 ], 691 [ 692 "cost_and_practicality.compute_budget_stated", 693 { 694 "applies_count": 470, 695 "applies_rate": 0.8378, 696 "yes_count": 96, 697 "compliance_rate": 0.2043 698 } 699 ], 700 [ 701 "statistical_methodology.confidence_intervals_or_error_bars", 702 { 703 "applies_count": 485, 704 "applies_rate": 0.8645, 705 "yes_count": 102, 706 "compliance_rate": 0.2103 707 } 708 ], 709 [ 710 "statistical_methodology.variance_reported", 711 { 712 "applies_count": 482, 713 "applies_rate": 0.8592, 714 "yes_count": 104, 715 "compliance_rate": 0.2158 716 } 717 ], 718 [ 719 "contamination.benchmark_contamination_addressed", 720 { 721 "applies_count": 323, 722 "applies_rate": 0.5758, 723 "yes_count": 82, 724 "compliance_rate": 0.2539 725 } 726 ], 727 [ 728 "human_studies.inclusion_exclusion_criteria", 729 { 730 "applies_count": 80, 731 "applies_rate": 0.1426, 732 "yes_count": 23, 733 "compliance_rate": 0.2875 734 } 735 ], 736 [ 737 "contamination.train_test_overlap_discussed", 738 { 739 "applies_count": 328, 740 "applies_rate": 0.5847, 741 "yes_count": 95, 742 "compliance_rate": 0.2896 743 } 744 ] 745 ], 746 "highest_compliance_questions": [ 747 [ 748 "evaluation_design.baselines_contemporary", 749 { 750 "applies_count": 507, 751 "applies_rate": 0.9037, 752 "yes_count": 399, 753 "compliance_rate": 0.787 754 } 755 ], 756 [ 757 "evaluation_design.failure_cases_discussed", 758 { 759 "applies_count": 555, 760 "applies_rate": 0.9893, 761 "yes_count": 460, 762 "compliance_rate": 0.8288 763 } 764 ], 765 [ 766 "evaluation_design.multiple_metrics", 767 { 768 "applies_count": 491, 769 "applies_rate": 0.8752, 770 "yes_count": 415, 771 "compliance_rate": 0.8452 772 } 773 ], 774 [ 775 "data_integrity.data_collection_described", 776 { 777 "applies_count": 541, 778 "applies_rate": 0.9643, 779 "yes_count": 469, 780 "compliance_rate": 0.8669 781 } 782 ], 783 [ 784 "evaluation_design.baselines_included", 785 { 786 "applies_count": 521, 787 "applies_rate": 0.9287, 788 "yes_count": 458, 789 "compliance_rate": 0.8791 790 } 791 ], 792 [ 793 "evaluation_design.negative_results_reported", 794 { 795 "applies_count": 545, 796 "applies_rate": 0.9715, 797 "yes_count": 482, 798 "compliance_rate": 0.8844 799 } 800 ], 801 [ 802 "evaluation_design.per_category_breakdown", 803 { 804 "applies_count": 539, 805 "applies_rate": 0.9608, 806 "yes_count": 481, 807 "compliance_rate": 0.8924 808 } 809 ], 810 [ 811 "claims_and_evidence.abstract_claims_supported", 812 { 813 "applies_count": 561, 814 "applies_rate": 1.0, 815 "yes_count": 513, 816 "compliance_rate": 0.9144 817 } 818 ], 819 [ 820 "setup_transparency.scaffolding_described", 821 { 822 "applies_count": 238, 823 "applies_rate": 0.4242, 824 "yes_count": 224, 825 "compliance_rate": 0.9412 826 } 827 ], 828 [ 829 "conflicts_of_interest.affiliations_disclosed", 830 { 831 "applies_count": 561, 832 "applies_rate": 1.0, 833 "yes_count": 555, 834 "compliance_rate": 0.9893 835 } 836 ] 837 ], 838 "methodology_tags": { 839 "benchmark-eval": 424, 840 "case-study": 115, 841 "qualitative": 68, 842 "theoretical": 62, 843 "observational": 50, 844 "meta-analysis": 31, 845 "rct": 7 846 }, 847 "claim_support": { 848 "moderate": 1243, 849 "strong": 1163, 850 "weak": 404, 851 "unsupported": 75 852 }, 853 "by_year": { 854 "2017": { 855 "n": 1, 856 "mean": 0.5278, 857 "median": 0.5278 858 }, 859 "2018": { 860 "n": 1, 861 "mean": 0.55, 862 "median": 0.55 863 }, 864 "2019": { 865 "n": 1, 866 "mean": 0.6471, 867 "median": 0.6471 868 }, 869 "2020": { 870 "n": 1, 871 "mean": 0.525, 872 "median": 0.525 873 }, 874 "2021": { 875 "n": 1, 876 "mean": 0.7073, 877 "median": 0.7073 878 }, 879 "2022": { 880 "n": 18, 881 "mean": 0.6239, 882 "median": 0.6585 883 }, 884 "2023": { 885 "n": 61, 886 "mean": 0.5068, 887 "median": 0.525 888 }, 889 "2024": { 890 "n": 129, 891 "mean": 0.5085, 892 "median": 0.5278 893 }, 894 "2025": { 895 "n": 282, 896 "mean": 0.4949, 897 "median": 0.518 898 }, 899 "2026": { 900 "n": 66, 901 "mean": 0.537, 902 "median": 0.5583 903 } 904 }, 905 "by_methodology_tag": { 906 "benchmark-eval": { 907 "n": 424, 908 "mean": 0.5355, 909 "median": 0.5385 910 }, 911 "case-study": { 912 "n": 115, 913 "mean": 0.4134, 914 "median": 0.4167 915 }, 916 "meta-analysis": { 917 "n": 31, 918 "mean": 0.408, 919 "median": 0.3514 920 }, 921 "observational": { 922 "n": 50, 923 "mean": 0.5767, 924 "median": 0.5973 925 }, 926 "qualitative": { 927 "n": 68, 928 "mean": 0.4469, 929 "median": 0.5 930 }, 931 "rct": { 932 "n": 7, 933 "mean": 0.5273, 934 "median": 0.4872 935 }, 936 "theoretical": { 937 "n": 62, 938 "mean": 0.5099, 939 "median": 0.5556 940 } 941 }, 942 "code_released_crosstab": { 943 "with_code": { 944 "n": 295, 945 "mean": 0.5372, 946 "median": 0.5476 947 }, 948 "without_code": { 949 "n": 252, 950 "mean": 0.4855, 951 "median": 0.5 952 } 953 }, 954 "code_vs_category_scores": { 955 "statistical_methodology": { 956 "with_code_mean": 0.3019, 957 "without_code_mean": 0.2317, 958 "diff": 0.0702 959 }, 960 "evaluation_design": { 961 "with_code_mean": 0.8314, 962 "without_code_mean": 0.7492, 963 "diff": 0.0823 964 }, 965 "claims_and_evidence": { 966 "with_code_mean": 0.6249, 967 "without_code_mean": 0.5106, 968 "diff": 0.1143 969 }, 970 "setup_transparency": { 971 "with_code_mean": 0.6973, 972 "without_code_mean": 0.5286, 973 "diff": 0.1687 974 }, 975 "limitations_and_scope": { 976 "with_code_mean": 0.5763, 977 "without_code_mean": 0.4683, 978 "diff": 0.108 979 }, 980 "data_integrity": { 981 "with_code_mean": 0.7931, 982 "without_code_mean": 0.501, 983 "diff": 0.2921 984 }, 985 "conflicts_of_interest": { 986 "with_code_mean": 0.4633, 987 "without_code_mean": 0.3595, 988 "diff": 0.1038 989 }, 990 "contamination": { 991 "with_code_mean": 0.2435, 992 "without_code_mean": 0.1284, 993 "diff": 0.1151 994 }, 995 "human_studies": { 996 "with_code_mean": 0.2713, 997 "without_code_mean": 0.238, 998 "diff": 0.0333 999 }, 1000 "cost_and_practicality": { 1001 "with_code_mean": 0.3138, 1002 "without_code_mean": 0.2037, 1003 "diff": 0.1101 1004 } 1005 }, 1006 "red_flags_distribution": { 1007 "No limitations section": 121, 1008 "No uncertainty quantification": 59, 1009 "No statistical significance tests": 50, 1010 "No statistical significance testing": 41, 1011 "Benchmark contamination risk unaddressed": 36, 1012 "No contamination analysis": 27, 1013 "Benchmark contamination not addressed": 24, 1014 "Contamination risk unaddressed": 19, 1015 "No variance or uncertainty quantification": 18, 1016 "Missing hyperparameters": 18, 1017 "No statistical uncertainty quantification": 17, 1018 "No statistical rigor": 15, 1019 "No funding disclosure": 14, 1020 "No ablation study": 14, 1021 "Company evaluating its own product": 13, 1022 "Company evaluating own product": 13, 1023 "No systematic review methodology": 12, 1024 "Benchmark contamination risk": 11, 1025 "No code or data release": 11, 1026 "No quantitative evaluation": 10 1027 }, 1028 "score_histogram": { 1029 "0-10%": 13, 1030 "10-20%": 18, 1031 "20-30%": 43, 1032 "30-40%": 51, 1033 "40-50%": 105, 1034 "50-60%": 156, 1035 "60-70%": 103, 1036 "70-80%": 53, 1037 "80-90%": 19 1038 } 1039 }