calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (17347B)
      1 {
      2   "paper_slug": "agentsnet-coordination-collaborative-2025",
      3   "calibration_date": "2026-02-28",
      4   "sonnet_scan_date": "2026-02-28",
      5   "agreement_rate": 0.94,
      6   "total_questions": 50,
      7   "agreements": 47,
      8   "disagreements": 3,
      9   "disagreement_details": [
     10     {
     11       "category": "statistical_methodology",
     12       "question": "significance_tests",
     13       "sonnet": {"applies": false, "answer": false},
     14       "opus": {"applies": true, "answer": false},
     15       "direction": "applies_boundary",
     16       "explanation": "The paper makes explicit comparative claims ('best performing models are Claude 3.7 Sonnet, Gemini 2.5 Pro, and Gemini 2.5 Flash', 'Gemini 2.5 Flash is roughly on par with Claude 3.7'). The schema states 'NA if the paper makes no comparative claims' — since comparative claims are present, significance tests apply. Sonnet treats benchmark evaluations as inherently non-comparative, but the paper does rank and compare models without statistical testing."
     17     },
     18     {
     19       "category": "statistical_methodology",
     20       "question": "effect_sizes_reported",
     21       "sonnet": {"applies": false, "answer": false},
     22       "opus": {"applies": true, "answer": false},
     23       "direction": "applies_boundary",
     24       "explanation": "Same reasoning as significance_tests: the paper makes comparative claims between models. Effect sizes (relative improvements, magnitude of differences) are applicable when claims of difference are made. While raw benchmark scores are reported, the paper never explicitly computes or frames relative differences as effect sizes. Applies because comparisons are made; answer is false because no formal effect sizes are provided."
     25     },
     26     {
     27       "category": "limitations_and_scope",
     28       "question": "threats_to_validity_specific",
     29       "sonnet": {"applies": true, "answer": false},
     30       "opus": {"applies": true, "answer": true},
     31       "direction": "opus_generous",
     32       "explanation": "Appendix G discusses specific threats beyond Section 6: (1) synchronous communication limits ecological validity, (2) binary metric may obscure partial progress in tasks with substantial near-correct solutions, (3) homogeneous agents do not capture heterogeneous real-world deployments, (4) no adversarial or faulty agents considered. These are specific to this study's design, not generic boilerplate. Sonnet focused on Section 6's narrower discussion of implementation choices and noted the absence of threats like graph size/topology bias, but Appendix G contains substantive, study-specific validity discussions."
     33     }
     34   ],
     35   "opus_checklist": {
     36     "artifacts": {
     37       "code_released": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "GitHub repository provided at https://github.com/floriangroetschla/AgentsNet (Section 5.1). Open-source code explicitly stated."
     41       },
     42       "data_released": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Dataset released on HuggingFace at https://huggingface.co/datasets/disco-eth/AgentsNet (Section 5.1)."
     46       },
     47       "environment_specified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper mentions LangChain and NetworkX as frameworks but provides no requirements.txt, Dockerfile, or detailed dependency version specifications."
     51       },
     52       "reproduction_instructions": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No step-by-step reproduction instructions in the paper. Code is on GitHub but no 'Reproducing Results' section or specific run commands are documented in the paper."
     56       }
     57     },
     58     "statistical_methodology": {
     59       "confidence_intervals_or_error_bars": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Table 2 reports standard error of the mean in parentheses (e.g., '0.14 (0.04)'). Figure 1 explicitly states 'Error bars indicate standard error of the mean.' Appendix C details the computation."
     63       },
     64       "significance_tests": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper makes comparative claims ('best performing models are Claude 3.7 Sonnet, Gemini 2.5 Pro, and Gemini 2.5 Flash'; 'Gemini 2.5 Flash is roughly on par with Claude 3.7') but uses no significance tests (p-values, t-tests, bootstrap, etc.) to support these comparisons."
     68       },
     69       "effect_sizes_reported": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "The paper reports raw benchmark scores but never explicitly computes or frames relative differences as effect sizes. Comparisons like 'roughly on par' and 'cheaper by a factor of 20' are made informally without formal effect size measures."
     73       },
     74       "sample_size_justified": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "27 network topologies with at least one repeat per graph (Section 5.1). No power analysis or justification for why this number of graphs and repeats is sufficient."
     78       },
     79       "variance_reported": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Standard error of the mean reported in Table 2 for all task-model combinations. The statistical methodology follows Miller [32] as described in Appendix C."
     83       }
     84     },
     85     "evaluation_design": {
     86       "baselines_included": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Ten frontier LLMs are compared against each other on the benchmark (Table 2), including Claude, GPT, Gemini, Llama 4, and o4-mini variants."
     90       },
     91       "baselines_contemporary": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "All evaluated models are 2024-2025 frontier models (Claude 3.5 Haiku, Claude 3.7 Sonnet, GPT-4.1 mini, Gemini 2.0/2.5 variants, Llama 4, o4-mini)."
     95       },
     96       "ablation_study": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "No ablation study examines which components of the benchmark design matter (e.g., number of message-passing rounds, chain-of-thought prompting, graph topology effects). The scaling experiment (Section 5.3) increases graph size but does not ablate system components."
    100       },
    101       "multiple_metrics": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Binary (fully correct) evaluation is the main metric (Table 2), and soft evaluation scores providing continuous quality measures are reported in Appendix B / Table 4."
    105       },
    106       "human_evaluation": {
    107         "applies": false,
    108         "answer": false,
    109         "justification": "Human evaluation is irrelevant to this benchmark paper evaluating LLM agents on structured mathematical/graph problems with objective ground truth."
    110       },
    111       "held_out_test_set": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Graphs are procedurally generated and no model tuning is performed on them. The evaluation is zero-shot with no development/test split confound."
    115       },
    116       "per_category_breakdown": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Figure 4 provides per-task and per-graph-size breakdown. Table 2 provides per-task scores. Both breakdowns complement the aggregate AGENTSNET score."
    120       },
    121       "failure_cases_discussed": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Section 5.4 presents three key findings about failure modes: strategy coordination failures, agents accepting erroneous information, and conflict resolution behaviors. Appendix E provides detailed transcript examples."
    125       },
    126       "negative_results_reported": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Performance drops to near zero for 100-agent networks (Figure 5). Many models perform poorly on VERTEXCOVER and COLORING. These negative results are reported candidly."
    130       }
    131     },
    132     "claims_and_evidence": {
    133       "abstract_claims_supported": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Abstract claims are well-supported: 'strong performance for small networks but fall off as network scales' (Table 2, Figures 4-5); '100 agents' (Figure 5); 'existing benchmarks cover at most 2-5 agents' (Section 2, Related Work)."
    137       },
    138       "causal_claims_justified": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "The paper is a benchmark evaluation that reports performance scores. No causal claims are made about the authors' own system — statements describe observed performance patterns (e.g., 'performance drops') rather than asserting causal mechanisms."
    142       },
    143       "generalization_bounded": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Claims are appropriately scoped to specific models tested on the AGENTSNET benchmark. Results are presented as performance on specific tasks and graph sizes, not generalized to all multi-agent coordination."
    147       },
    148       "alternative_explanations_discussed": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Section 5.4 describes failure patterns qualitatively but does not systematically discuss alternative explanations for performance differences between models (e.g., context length limitations vs. reasoning capability vs. instruction-following quality)."
    152       }
    153     },
    154     "setup_transparency": {
    155       "model_versions_specified": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Table 3 in Appendix A lists exact model versions: claude-3-5-haiku-20241022, claude-3-7-sonnet-20250219, gpt-4.1-mini, gemini-2.5-flash-preview-04-17, etc. These are the actual API model identifiers used."
    159       },
    160       "prompts_provided": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Full system prompt template provided in Appendix A with all template variables. Task-specific prompt texts ([task1] and [task2]) for all five tasks provided in Appendix B. Template variables are deterministic from the graph instance, and the dataset is released, enabling full prompt reconstruction."
    164       },
    165       "hyperparameters_reported": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "No LLM API hyperparameters (temperature, top-p, max tokens) are reported anywhere in the paper or appendices."
    169       },
    170       "scaffolding_described": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 4 and Algorithm 1 describe the message-passing protocol in detail: synchronous round structure, JSON message formatting, retry logic for invalid JSON, chain-of-thought prompting option."
    174       },
    175       "data_preprocessing_documented": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 5.1 describes graph generation: 27 topologies from 3 graph models (small-world, scale-free, Delaunay) x 3 sizes (4, 8, 16) x 3 instances. Appendix D details the graph generation models."
    179       }
    180     },
    181     "limitations_and_scope": {
    182       "limitations_section_present": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 6 is titled 'Limitations' and Appendix G provides an extended 'Limitations' discussion covering synchronous communication, binary metrics, agent homogeneity, scalability, and adversarial settings."
    186       },
    187       "threats_to_validity_specific": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Appendix G discusses specific threats: (1) synchronous communication model limits ecological validity for real-world transfer, (2) binary metric may obscure partial progress, (3) homogeneous agents miss heterogeneous deployment challenges, (4) no adversarial/faulty agents tested. These are specific to this study's design choices."
    191       },
    192       "scope_boundaries_stated": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "While Appendix G identifies specific design limitations, the paper does not explicitly state what claims the results do NOT support or what the benchmark does NOT measure about multi-agent coordination capability more broadly."
    196       }
    197     },
    198     "data_integrity": {
    199       "raw_data_available": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Dataset released on HuggingFace at https://huggingface.co/datasets/disco-eth/AgentsNet, allowing independent verification of benchmark instances."
    203       },
    204       "data_collection_described": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Graph generation procedurally described in Section 5.1 and Appendix D: three graph models (Watts-Strogatz, Barabasi-Albert, Delaunay), three sizes, three instances each."
    208       },
    209       "recruitment_methods_described": {
    210         "applies": false,
    211         "answer": false,
    212         "justification": "No human participants. Study uses procedurally generated graphs and LLM API calls."
    213       },
    214       "data_pipeline_documented": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Full pipeline documented: graph generation (Section 5.1, Appendix D), agent instantiation and message-passing (Section 4, Algorithm 1), response extraction and scoring (Section 3, Appendix C)."
    218       }
    219     },
    220     "conflicts_of_interest": {
    221       "funding_disclosed": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No acknowledgments or funding disclosure section in the paper. Two authors are affiliated with Google Research."
    225       },
    226       "affiliations_disclosed": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "Author affiliations listed on the title page: ETH Zurich, RWTH Aachen University, and Google Research."
    230       },
    231       "funder_independent_of_outcome": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "Two of five authors are from Google Research, which develops Gemini models — four of ten evaluated models. This represents a non-independent relationship. No conflict acknowledgment is made."
    235       },
    236       "financial_interests_declared": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No competing interests statement in the paper. The Google Research affiliation and potential interest in Gemini model performance are not explicitly declared as a conflict."
    240       }
    241     },
    242     "contamination": {
    243       "training_cutoff_stated": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "The paper evaluates LLMs on the benchmark but does not state training data cutoff dates for any evaluated model."
    247       },
    248       "train_test_overlap_discussed": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No discussion of whether benchmark tasks or graph structures could appear in training data. The procedural generation makes exact instance contamination unlikely, but the underlying problem types (graph coloring, leader election, etc.) are classical and well-studied."
    252       },
    253       "benchmark_contamination_addressed": {
    254         "applies": true,
    255         "answer": false,
    256         "justification": "The benchmark is novel (2025), but classical distributed computing problems (graph coloring, vertex cover, etc.) are extensively documented in textbooks and papers likely in training data. The paper does not address whether familiarity with these problem types affects performance."
    257       }
    258     },
    259     "human_studies": {
    260       "pre_registered": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this benchmark evaluation paper."
    264       },
    265       "irb_or_ethics_approval": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this benchmark evaluation paper."
    269       },
    270       "demographics_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this benchmark evaluation paper."
    274       },
    275       "inclusion_exclusion_criteria": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this benchmark evaluation paper."
    279       },
    280       "randomization_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this benchmark evaluation paper."
    284       },
    285       "blinding_described": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human participants in this benchmark evaluation paper."
    289       },
    290       "attrition_reported": {
    291         "applies": false,
    292         "answer": false,
    293         "justification": "No human participants in this benchmark evaluation paper."
    294       }
    295     },
    296     "cost_and_practicality": {
    297       "inference_cost_reported": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Figure 1 plots model performance versus API cost per repeat in USD (as of May 15, 2025), ranging from ~$1 to ~$200 per repeat. Pareto-optimal models identified."
    301       },
    302       "compute_budget_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Per-repeat API costs shown in Figure 1, but total computational budget (total number of API calls, total cost across all experiments) is not explicitly stated."
    306       }
    307     }
    308   }
    309 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs