calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (20126B)
      1 {
      2   "paper_slug": "2025-ai-agent-2026",
      3   "total_questions": 50,
      4   "agreement_count": 48,
      5   "disagreement_count": 2,
      6   "agreement_rate": 0.96,
      7   "disagreements": [
      8     {
      9       "category": "setup_transparency",
     10       "question": "model_versions_specified",
     11       "sonnet_applies": true,
     12       "sonnet_answer": true,
     13       "opus_applies": true,
     14       "opus_answer": false,
     15       "opus_justification": "The paper specifies 'ChatGPT 5.2 with deep research', 'Claude Sonnet 4.5 with research mode', 'Gemini 2.5 with research mode', and 'OpenAI GPT-5.2 with web search'. These are marketing/product names without snapshot dates or API version identifiers. The schema explicitly states: 'Marketing names like Gemini-2.5 or GPT-4o without a snapshot date or API version do NOT count as specified versions.' None of these names include a snapshot date (e.g., 'gpt-5.2-2025-11-15') or API version, so the criterion is not satisfied.",
     16       "sonnet_justification": "The paper specifies the exact model versions used for LLM assistance: 'ChatGPT 5.2 with deep research', 'Claude Sonnet 4.5 with research mode', 'Gemini 2.5 with research mode', 'OpenAI GPT-5.2 with web search' (Generative AI Usage Statement, Section B.5, B.6).",
     17       "direction": "sonnet_generous"
     18     },
     19     {
     20       "category": "setup_transparency",
     21       "question": "hyperparameters_reported",
     22       "sonnet_applies": false,
     23       "sonnet_answer": false,
     24       "opus_applies": true,
     25       "opus_answer": false,
     26       "opus_justification": "The paper uses LLM APIs (GPT-5.2, Claude Sonnet 4.5, Gemini 2.5) as part of its documented methodology pipeline for candidate discovery (Section B.5), annotation verification (Section B.6), and search term generation (Section C.1). The schema states: 'If the paper uses an LLM API without stating temperature/sampling settings, NO.' The paper does not report temperature, top-p, or any other sampling parameters for any of these LLM calls. While the LLM outputs are screened by humans, they are still part of the reproducible methodology and hyperparameters should be reported.",
     27       "sonnet_justification": "LLMs are used for candidate surfacing and annotation verification, not for any measurements that depend on sampling parameters. The outputs are screened by humans, so temperature/sampling settings are not material to the results.",
     28       "direction": "applies_boundary"
     29     }
     30   ],
     31   "opus_checklist": {
     32     "artifacts": {
     33       "code_released": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No code repository URL is provided in the paper. The annotation data (JSON and CSV) is released on Zenodo (https://doi.org/10.5281/zenodo.18701931), but no analysis scripts, visualization code, or annotation tools are released."
     37       },
     38       "data_released": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The full Index annotations for all 30 agents across 45 fields are released in JSON and CSV format on Zenodo (https://doi.org/10.5281/zenodo.18701931). The online Index is also available at https://aiagentindex.mit.edu."
     42       },
     43       "environment_specified": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "This is a documentation/index paper with no computational experiments requiring a reproducible software environment. LLMs were used via commercial APIs (ChatGPT, Claude, Gemini) rather than local installations."
     47       },
     48       "reproduction_instructions": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "While the annotation methodology, inclusion criteria, and annotation guide are described in detail (Sections 3, B.2, B.4), there are no step-by-step instructions for reproducing the full index from scratch. The LLM-assisted steps used specific commercial models and prompts, but the overall process relies on expert judgment not fully codified."
     52       }
     53     },
     54     "statistical_methodology": {
     55       "confidence_intervals_or_error_bars": {
     56         "applies": false,
     57         "answer": false,
     58         "justification": "This is a descriptive documentation paper reporting counts and proportions (e.g., '15/30 agents reference AI safety frameworks'). No inferential statistics are used; confidence intervals are not applicable to this paper type."
     59       },
     60       "significance_tests": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "The paper reports descriptive counts and proportions only. It makes no comparative statistical claims that would require significance testing."
     64       },
     65       "effect_sizes_reported": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "No effect sizes are relevant. The paper documents categorical presence/absence of features across 30 agents."
     69       },
     70       "sample_size_justified": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The paper indexes 30 agents selected via inclusion criteria (Section 3.1) but does not formally justify why 30 is sufficient or discuss whether additional agents would change the findings. The selection is criteria-driven but no power analysis or sample adequacy discussion is provided."
     74       },
     75       "variance_reported": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "This is a single-pass annotation study. There are no repeated measurements or experimental runs requiring variance reporting. Inter-annotator agreement is reported as a count (37/1,350 discrepancies) rather than a variance metric."
     79       }
     80     },
     81     "evaluation_design": {
     82       "baselines_included": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The paper explicitly builds on and compares to the 2024 AI Agent Index (Casper et al., [22]). Figure 13 shows changes in annotation fields between the 2024 and 2025 versions. The Princeton Holistic Agent Leaderboard and AIAgentList.com are also referenced as contemporary alternatives."
     86       },
     87       "baselines_contemporary": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "The 2024 AI Agent Index is the direct predecessor from the same research group. The Princeton Holistic Agent Leaderboard [67] and AIAgentList.com [4] are also cited as contemporary comparison points."
     91       },
     92       "ablation_study": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "This is a documentation/index paper with no system components to ablate. There is no model or algorithm whose components could be individually tested."
     96       },
     97       "multiple_metrics": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The paper uses 45 annotation fields across 6 categories (product overview, company, technical, autonomy, ecosystem, safety) to characterize agents, providing a multi-dimensional documentation framework."
    101       },
    102       "human_evaluation": {
    103         "applies": false,
    104         "answer": false,
    105         "justification": "The paper IS the human annotation product. There is no system output for humans to evaluate; the paper documents publicly available information about AI agents through expert annotation."
    106       },
    107       "held_out_test_set": {
    108         "applies": false,
    109         "answer": false,
    110         "justification": "No train/test split is applicable. This is a documentation paper that indexes the full population of qualifying agents as of December 31, 2025."
    111       },
    112       "per_category_breakdown": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Findings are systematically broken down by agent category (Chat, Browser, Enterprise) across all major dimensions. Section 4 and Figures 3, 5, 6, 12 provide per-category analyses. For example, browser agents have 64% missing safety fields vs. 43% for chat agents."
    116       },
    117       "failure_cases_discussed": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper extensively documents transparency gaps, missing safety information (133/240 fields), security incidents (prompt injection in Perplexity Comet, Section 5.2), and areas where documentation is lacking across the ecosystem."
    121       },
    122       "negative_results_reported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper prominently reports what was NOT found: 133/240 safety fields have no information (Figure 6), 25/30 agents disclose no internal safety results, 23/30 have no third-party testing, only 4/30 companies provided substantive responses to outreach."
    126       }
    127     },
    128     "claims_and_evidence": {
    129       "abstract_claims_supported": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The abstract claims about documenting 30 agents, finding different transparency levels, and observing that most developers share little about safety are directly supported by Section 4.6 and Figure 6 (133/240 safety fields with no information)."
    133       },
    134       "causal_claims_justified": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "The paper is descriptive and makes no causal claims. Statements like 'this suggests the need for more information' are normative recommendations, not causal assertions. No language of 'improves', 'causes', or 'leads to' is used to describe empirical findings."
    138       },
    139       "generalization_bounded": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper explicitly bounds its scope: 'Our analysis focuses on agentic systems that are publicly available, deployable with minimal configuration, and general-purpose' (Section 6.2). Domain-specific agents, internal deployments, and documentation in languages other than English and Chinese are explicitly acknowledged as excluded."
    143       },
    144       "alternative_explanations_discussed": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The paper discusses alternative explanations: Chinese companies' compliance 'may simply not be documented publicly' (Section 4.2); lower developer response rates attributed to narrower inclusion criteria (footnote 4); distinction between 'None found' and 'None' throughout acknowledges that absence of public information does not equal absence of practice."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "The paper names 'ChatGPT 5.2 with deep research', 'Claude Sonnet 4.5 with research mode', 'Gemini 2.5 with research mode', and 'OpenAI GPT-5.2 with web search' (Generative AI Usage Statement, Sections B.5-B.6). However, the schema explicitly states that marketing names without a snapshot date or API version do NOT count. None of these include snapshot dates (e.g., 'gpt-5.2-2025-11-15') or API version identifiers."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Full prompts are provided verbatim in Section B.5 (agent discovery prompts for ChatGPT 5.2, Claude Sonnet 4.5, Gemini 2.5), Section B.6 (annotation verification prompts in Listings 1-3), and Section C.1 (search term generation prompts). These are the actual prompt texts, not summaries."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The paper uses LLM APIs (GPT-5.2, Claude Sonnet 4.5, Gemini 2.5) as part of its methodology for candidate discovery (Section B.5), annotation verification (Section B.6), and search term generation (Section C.1). No temperature, top-p, or other sampling parameters are reported for any of these API calls. Per the schema: 'If the paper uses an LLM API without stating temperature/sampling settings, NO.'"
    165       },
    166       "scaffolding_described": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "The paper does not use agentic scaffolding. LLMs are used as research assistance tools with human review at every stage, not as autonomous agents with tool access, retry logic, or memory management."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The full pipeline is documented: LLM queries surfaced 95 candidate agents (Section 3.3) -> screened against inclusion criteria (Section 3.1) -> ambiguous cases included for in-depth annotation -> final inclusion decisions after full evaluation -> 7 experts annotated by field -> independent review -> 37/1,350 discrepancies resolved -> GPT-5.2 verification -> company corrections (Section 3.4). Filtering criteria at each stage are stated."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 6.2 'Limitations and Outlook' provides a dedicated, substantive discussion of the paper's methodological limitations spanning multiple paragraphs."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 6.2 discusses specific threats: inclusion criteria favor significant agents, which affects generalizability; public interest metrics favor consumer over enterprise products; reliance on English and Chinese documentation; snapshot as of December 31, 2025; only 23% developer response rate; possible missed agents or inaccuracies despite vetting."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The paper explicitly states what it does NOT cover: domain-specific agents excluded (Section 3.1), company-internal deployments excluded, no experimental testing performed ('We did not perform experimental testing', Section 3.4), systems behind closed doors remain opaque (Section 6.2). These are specific boundary statements, not generic disclaimers."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The full annotations (1,350 fields for 30 agents) are available in JSON and CSV format on Zenodo (https://doi.org/10.5281/zenodo.18701931), enabling independent verification of all documented claims."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section 3.4 describes the annotation procedure: seven subject matter experts annotated agents based on public documentation, websites, demos, and governance documents. Each expert was responsible for specific fields. The annotation guide is detailed in Section B.4. All web sources were archived."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "The paper indexes AI agent products, not human participants. The annotators are the paper's own authors (seven subject matter experts), not a recruited sample requiring recruitment methodology description."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The full pipeline is documented from start to finish: LLM queries -> 95 candidates -> inclusion criteria screening -> expert annotation by field category -> independent review by second annotator -> 37/1,350 discrepancies resolved through discussion -> GPT-5.2 automated verification -> company correction opportunity (4-week window). Stage counts and criteria provided."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "The Acknowledgments section states: 'This research was supported by the MATS Research program, which provided funding for L.S. and M.Y. through research stipends.' Additional organizational support from MATS and research manager Keivan Navaie is also disclosed."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "All author affiliations are listed on the first page: University of Cambridge, University of Washington, Harvard Law School, Stanford University, Concordia AI, University of Pennsylvania, MIT, Hebrew University of Jerusalem. The Claude Code sample entry notes 'No authors have conflicts of interest related to Anthropic or Claude Code.'"
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "MATS Research program is an AI safety research fellowship with no financial interest in particular AI agent products being rated more or less transparent. The funder does not benefit from any specific finding."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "There is no formal competing interests statement or declaration of financial interests (patents, equity, consulting relationships). The one-off mention about no conflicts related to Claude Code (Section A.2) does not constitute a systematic declaration covering all 30 agents evaluated."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "This is a documentation/index paper that does not evaluate pre-trained model capabilities on any benchmark. LLMs are used as research assistance tools, not for capability evaluation."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No benchmark evaluation is conducted. The paper documents publicly available information about AI agent products."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No benchmark evaluation is conducted. The paper documents agent features and transparency, not model capabilities on knowledge benchmarks."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "The paper indexes AI agent products, not human participants. The annotators are the paper's own authors conducting research, not experimental subjects."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants are studied. The paper analyzes publicly available documentation of AI products."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants. The annotators are the paper's authors, not a recruited sample."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participant inclusion/exclusion criteria apply. The inclusion criteria described are for AI agent products (Section 3.1), not human subjects."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants and no experimental assignment to conditions."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in an experimental sense. Annotators cannot be blinded to which agent they are annotating."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants. No attrition to report."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": false,
    295         "answer": false,
    296         "justification": "This is a documentation/survey paper. The paper does not propose a method with per-example inference cost. LLMs are used as auxiliary research tools, not as the paper's primary method."
    297       },
    298       "compute_budget_stated": {
    299         "applies": false,
    300         "answer": false,
    301         "justification": "This is a documentation/index paper. Computational resources used for LLM-assisted steps are auxiliary to the primarily human annotation effort."
    302       }
    303     }
    304   }
    305 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs