calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (20229B)
      1 {
      2   "paper_slug": "agentasajudge-evaluate-agents-2024",
      3   "calibration_date": "2026-02-28",
      4   "calibration_model": "opus",
      5   "total_questions": 50,
      6   "agreement_count": 45,
      7   "disagreement_count": 5,
      8   "agreement_rate": 0.90,
      9   "disagreements": [
     10     {
     11       "category": "human_studies",
     12       "question": "pre_registered",
     13       "sonnet": {"applies": true, "answer": false},
     14       "opus": {"applies": false, "answer": false},
     15       "direction": "applies_boundary",
     16       "explanation": "The three human evaluators are co-authors performing annotation work as part of their research contribution, not external human participants in a human subjects study. Pre-registration requirements apply to studies with recruited human participants, not co-author annotators. The paper's 'Human-as-a-Judge' analysis examines inter-rater reliability of their own team's annotations, which is a standard methodological practice, not a study ON human subjects."
     17     },
     18     {
     19       "category": "human_studies",
     20       "question": "irb_or_ethics_approval",
     21       "sonnet": {"applies": true, "answer": false},
     22       "opus": {"applies": false, "answer": false},
     23       "direction": "applies_boundary",
     24       "explanation": "IRB/ethics approval is required for research conducted ON human subjects. The evaluators here are three co-authors performing research tasks in their capacity as researchers. Their disagreement analysis is a methodological reflection, not a study with recruited human participants. Standard CS research practice does not require IRB review for co-author annotation work."
     25     },
     26     {
     27       "category": "human_studies",
     28       "question": "demographics_reported",
     29       "sonnet": {"applies": true, "answer": false},
     30       "opus": {"applies": false, "answer": false},
     31       "direction": "applies_boundary",
     32       "explanation": "Demographics reporting applies when there are recruited human participants whose characteristics could affect generalizability. The evaluators here are three identified co-authors ('AI experts from the authors') performing a specific methodological role. Their demographic characteristics are not relevant in the same way as external participant demographics in a user study."
     33     },
     34     {
     35       "category": "human_studies",
     36       "question": "inclusion_exclusion_criteria",
     37       "sonnet": {"applies": true, "answer": false},
     38       "opus": {"applies": false, "answer": false},
     39       "direction": "applies_boundary",
     40       "explanation": "Inclusion/exclusion criteria apply to studies that recruit participants from a population. The evaluators are three self-selected co-authors, not participants recruited from a pool. There is no 'recruitment' in the meaningful sense — they are researchers contributing to their own paper."
     41     },
     42     {
     43       "category": "human_studies",
     44       "question": "blinding_described",
     45       "sonnet": {"applies": true, "answer": false},
     46       "opus": {"applies": false, "answer": false},
     47       "direction": "applies_boundary",
     48       "explanation": "Blinding applies to experimental studies with human participants where knowledge of conditions could bias outcomes. The evaluators are co-author annotators performing a methodological validation role, not participants in a controlled experiment. While blinding WOULD have strengthened the evaluation design, the question is about applicability to this paper type, and co-author annotation work does not structurally require blinding in the human_studies sense."
     49     }
     50   ],
     51   "opus_checklist": {
     52     "artifacts": {
     53       "code_released": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "GitHub link provided: https://github.com/metauto-ai/agent-as-a-judge, listed in the paper header. This is a working URL to the project repository."
     57       },
     58       "data_released": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "DevAI dataset released on HuggingFace: https://huggingface.co/devai-benchmark, stated in the paper header. Dataset is publicly available."
     62       },
     63       "environment_specified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No requirements.txt, Dockerfile, conda environment file, or detailed environment setup section is provided in the paper. Dependencies are visible in trajectory samples but no formal environment specification is given."
     67       },
     68       "reproduction_instructions": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No step-by-step reproduction instructions in the paper. A GitHub link exists but the paper itself contains no 'Reproducing Results' section or commands to replicate the experiments."
     72       }
     73     },
     74     "statistical_methodology": {
     75       "confidence_intervals_or_error_bars": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "Results in Tables 2-4 are reported as point estimates (e.g., '90.44% alignment rate') with no confidence intervals, error bars, or uncertainty quantification."
     79       },
     80       "significance_tests": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The paper claims Agent-as-a-Judge outperforms LLM-as-a-Judge based on comparing alignment rate percentages (e.g., 90.44% vs 60.38%) without any statistical significance tests."
     84       },
     85       "effect_sizes_reported": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Raw alignment rates are reported side by side in Table 3 but no standardized effect sizes (Cohen's d, odds ratios) are computed. The paper reports absolute differences without formal effect size calculation."
     89       },
     90       "sample_size_justified": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The benchmark has only 55 tasks and 365 requirements. No power analysis or justification for why 55 tasks is sufficient to support the generalization claims made."
     94       },
     95       "variance_reported": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "All results appear to be single-run experiments. No standard deviation, variance across runs, or spread measures are reported."
     99       }
    100     },
    101     "evaluation_design": {
    102       "baselines_included": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "LLM-as-a-Judge is compared as a direct baseline (Table 3). Human-as-a-Judge serves as the gold standard. Three developer agents (MetaGPT, GPT-Pilot, OpenHands) are benchmarked."
    106       },
    107       "baselines_contemporary": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "LLM-as-a-Judge (Zheng et al., 2024) is contemporary. The three developer agents are all current open-source frameworks with >30K GitHub stars. GPT-4o-2024-05-13 is used as the backend."
    111       },
    112       "ablation_study": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 4.3 and Appendix K present extensive ablation studies with 8 components (ask, graph, read, locate, retrieve, search, planning, memory). Tables 4-7 show progressive component addition results."
    116       },
    117       "multiple_metrics": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Multiple metrics used: Alignment Rate, Judge Shift, PR Curves, Requirements Met (independent and dependency-aware), Task Solve Rate, Self-Termination rate."
    121       },
    122       "human_evaluation": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Three expert human evaluators performed manual evaluation over 86.5 total hours across two rounds including consensus-building (Section 3). This serves as the ground truth."
    126       },
    127       "held_out_test_set": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "All 55 DevAI tasks are used for both ablation studies (component selection) and final performance reporting. No train/test split. The same data used to select the best Agent-as-a-Judge configuration is used to report its final performance."
    131       },
    132       "per_category_breakdown": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "Results are broken down by developer agent (MetaGPT, GPT-Pilot, OpenHands) but not by task category, despite the benchmark defining categories (supervised learning, RL, CV, NLP, etc.) in Figure 2."
    136       },
    137       "failure_cases_discussed": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Appendix F discusses qualitative failure modes of developer agents. Section 4.1 discusses Agent-as-a-Judge failure modes (planning instability, memory chain errors). Ablation results show where components hurt performance."
    141       },
    142       "negative_results_reported": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Ablation studies (Section 4.3, Appendix K) show that search, planning, and memory components actually hurt performance relative to configurations without them. These negative results are explicitly reported and analyzed."
    146       }
    147     },
    148     "claims_and_evidence": {
    149       "abstract_claims_supported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Abstract claims that Agent-as-a-Judge 'dramatically outperforms LLM-as-a-Judge and is as reliable as our human evaluation baseline' are supported by Table 3 showing ~90% alignment for Agent-as-a-Judge vs ~60-70% for LLM-as-a-Judge, with individual human evaluators averaging ~85-89%."
    153       },
    154       "causal_claims_justified": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "Ablation studies make causal claims (e.g., 'adding the graph component increases performance to 75.95%'). The sequential addition design does not isolate component interactions. Ablations are run only on OpenHands, not validated across all three developer agents."
    158       },
    159       "generalization_bounded": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper frames Agent-as-a-Judge as a general framework ('Evaluate Agents with Agents') but validates it only on 55 AI development code-generation tasks. The abstract and conclusion make broad claims ('concrete step forward for modern agentic systems') without bounding results to this narrow setting."
    163       },
    164       "alternative_explanations_discussed": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "No discussion of alternative explanations for Agent-as-a-Judge's superior performance. The key confound — Agent-as-a-Judge has workspace/file access while LLM-as-a-Judge does not — is partially noted via gray-box vs black-box comparison but not systematically disentangled. Whether a more capable LLM without agentic features would achieve similar results is not explored."
    168       }
    169     },
    170     "setup_transparency": {
    171       "model_versions_specified": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Paper specifies 'gpt-4o-2024-05-13' as the backend LLM (Section 2.3, Table 1, trajectory samples). This is a specific model version with a date identifier."
    175       },
    176       "prompts_provided": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Appendix L provides actual prompt text for the Agent-as-a-Judge system: system prompt (L.1), locate module (L.2), retrieve module (L.3), ask module (L.4), locate user prompt (L.5). Templates use placeholders ({criteria}, {evidence}) but the DevAI dataset is released, and Appendix M shows sample evidence, enabling reconstruction."
    180       },
    181       "hyperparameters_reported": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No temperature, top-p, max tokens, or other sampling parameters are reported for the LLM calls made by Agent-as-a-Judge or the developer agents."
    185       },
    186       "scaffolding_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 4.1 and Appendix C describe the Agent-as-a-Judge scaffold in detail: eight modular components (graph, locate, read, search, retrieve, ask, memory, planning) with their roles. Figures 6 and 9 show pipeline diagrams."
    190       },
    191       "data_preprocessing_documented": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Appendix E documents the DevAI creation pipeline: manually drafted queries (E.1), setting judging criteria (E.2), building dependency DAGs (E.3), refinement rounds (E.4), dataset analysis (E.5), auxiliary information (E.6). Filtering criteria at each stage are described."
    195       }
    196     },
    197     "limitations_and_scope": {
    198       "limitations_section_present": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No dedicated limitations or threats-to-validity section exists. Some limitations are mentioned in passing (e.g., 'a perfect Agent-as-a-Judge is not the focus of this proof of concept') but there is no labeled section with substantive limitation discussion."
    202       },
    203       "threats_to_validity_specific": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No systematic discussion of specific threats to validity. The paper briefly notes limitations like the benchmark being small-scale but does not enumerate specific threats to the validity of its conclusions."
    207       },
    208       "scope_boundaries_stated": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No explicit scope boundaries stated. The paper evaluates 55 AI development tasks in Python but does not explicitly state what domains, task types, or agent types the results do NOT generalize to."
    212       }
    213     },
    214     "data_integrity": {
    215       "raw_data_available": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "DevAI dataset released on HuggingFace (https://huggingface.co/devai-benchmark). Appendix E.7 shows the JSON format. Tasks, requirements, and preferences are publicly available."
    219       },
    220       "data_collection_described": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Appendix E documents the data collection process in detail: expert annotators manually crafted queries targeting AI development tasks, set binary requirements, built dependency DAGs, and performed two rounds of review and refinement."
    224       },
    225       "recruitment_methods_described": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "Appendix H states 'We recruited three AI experts from the authors to perform human evaluation.' The evaluators are co-authors, introducing a conflict of interest. Their selection process and qualifications beyond 'AI experts' are not described."
    229       },
    230       "data_pipeline_documented": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "Pipeline from task creation to final evaluation is documented: draft queries -> set requirements -> build dependency DAGs -> refinement rounds -> execute with developer agents (1800-second time limit) -> collect workspaces and trajectories."
    234       }
    235     },
    236     "conflicts_of_interest": {
    237       "funding_disclosed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Acknowledgements section discloses funding from KAUST Center of Excellence for Generative AI under award number 5940 and the SDAIA-KAUST Center of Excellence in Data Science and Artificial Intelligence."
    241       },
    242       "affiliations_disclosed": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "Author affiliations clearly stated: Meta AI and KAUST. The paper uses GPT-4o (OpenAI), not Meta or KAUST products, so no direct product conflict."
    246       },
    247       "funder_independent_of_outcome": {
    248         "applies": true,
    249         "answer": true,
    250         "justification": "KAUST is an academic institution with no financial stake in Agent-as-a-Judge outperforming LLM-as-a-Judge or in the specific developer agents evaluated."
    251       },
    252       "financial_interests_declared": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "No competing interests statement or declaration of financial interests (patents, equity, startups) is present in the paper. Absence of disclosure is not the same as absence of conflict."
    256       }
    257     },
    258     "contamination": {
    259       "training_cutoff_stated": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "The paper uses gpt-4o-2024-05-13 as the backend but does not state its training data cutoff date. The DevAI benchmark is novel but no confirmation that it post-dates the training cutoff is provided."
    263       },
    264       "train_test_overlap_discussed": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No discussion of whether gpt-4o's training data could overlap with tasks similar to those in DevAI. The benchmark is hand-crafted but overlap risk is not formally addressed."
    268       },
    269       "benchmark_contamination_addressed": {
    270         "applies": true,
    271         "answer": false,
    272         "justification": "DevAI is a newly created benchmark that likely post-dates gpt-4o's training cutoff, but the paper does not explicitly confirm this. No analysis of contamination risk is provided."
    273       }
    274     },
    275     "human_studies": {
    276       "pre_registered": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "The three human evaluators are co-authors performing annotation work as part of their research contribution, not external human participants in a human subjects study. Pre-registration requirements are structurally inapplicable to co-author annotation work."
    280       },
    281       "irb_or_ethics_approval": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "The evaluators are co-author researchers, not recruited human participants. IRB review applies to research conducted on human subjects, not to co-authors performing their research roles."
    285       },
    286       "demographics_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "Demographics reporting applies to recruited human participants whose characteristics could affect generalizability. Co-author annotators performing a methodological role do not constitute human participants requiring demographic characterization."
    290       },
    291       "inclusion_exclusion_criteria": {
    292         "applies": false,
    293         "answer": false,
    294         "justification": "Inclusion/exclusion criteria apply to studies recruiting participants from a population. The evaluators are self-selected co-authors contributing to their own paper, not recruited from a participant pool."
    295       },
    296       "randomization_described": {
    297         "applies": false,
    298         "answer": false,
    299         "justification": "All three evaluators assessed all outputs in a repeated-measures design, so there is no assignment of participants to conditions to randomize. In addition, the evaluators are co-authors, not participants in a human subjects study."
    300       },
    301       "blinding_described": {
    302         "applies": false,
    303         "answer": false,
    304         "justification": "Co-author annotators performing a research methodology role do not constitute human participants in a controlled experiment requiring blinding. While blinding would have strengthened the evaluation, the human_studies blinding criterion is structurally inapplicable to co-author annotation work."
    305       },
    306       "attrition_reported": {
    307         "applies": false,
    308         "answer": false,
    309         "justification": "All three co-author evaluators completed both rounds. Attrition reporting applies to studies with recruited human participants, not co-author annotation work."
    310       }
    311     },
    312     "cost_and_practicality": {
    313       "inference_cost_reported": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "Section 4.4 reports that Agent-as-a-Judge cost $30.58 in API calls and took 118.43 minutes, versus roughly $1,297.50 and 86.5 hours for Human-as-a-Judge. Table 1 reports per-agent costs ($1.19, $3.92, $6.38)."
    317       },
    318       "compute_budget_stated": {
    319         "applies": true,
    320         "answer": true,
    321         "justification": "Section 2.3: 'a full evaluation on DevAI with one of these three took around 210.65 USD and 14 hours.' Section 4.4 reports compute costs for all major experimental components."
    322       }
    323     }
    324   }
    325 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs