calibration.json (18343B)
1 { 2 "calibration_metadata": { 3 "paper_slug": "agentmesh-cooperative-multiagent-2025", 4 "calibration_date": "2026-02-28", 5 "sonnet_scan_date": "2026-02-28", 6 "calibrator": "opus", 7 "agreement_rate": 0.88, 8 "total_questions": 50, 9 "agreements": 44, 10 "disagreements": 6 11 }, 12 "disagreements": [ 13 { 14 "category": "evaluation_design", 15 "question": "baselines_contemporary", 16 "sonnet": {"applies": false, "answer": false}, 17 "opus": {"applies": true, "answer": false}, 18 "direction": "applies_boundary", 19 "explanation": "Sonnet set applies=false because no formal baselines are included. Opus sets applies=true because the paper does make informal comparative claims (multi-agent vs single-agent GPT-4) and could have included contemporary baselines like ChatDev or MetaGPT. Per scan-agent instructions: 'Do NOT set applies=false when a paper is weak in an area.' The absence of baselines is a weakness, not structural inapplicability." 20 }, 21 { 22 "category": "evaluation_design", 23 "question": "multiple_metrics", 24 "sonnet": {"applies": false, "answer": false}, 25 "opus": {"applies": true, "answer": false}, 26 "direction": "applies_boundary", 27 "explanation": "Sonnet set applies=false because no quantitative metrics are reported. Opus sets applies=true because the paper presents a system that generates code and could have measured success rates, code quality metrics, token usage, etc. The paper chose not to report metrics, which is a weakness, not structural inapplicability. A case-study paper demonstrating a code generation system can and should report metrics." 28 }, 29 { 30 "category": "evaluation_design", 31 "question": "human_evaluation", 32 "sonnet": {"applies": false, "answer": false}, 33 "opus": {"applies": true, "answer": false}, 34 "direction": "applies_boundary", 35 "explanation": "Sonnet set applies=false reasoning it is a 'system demonstration' where human evaluation is not in scope. Opus sets applies=true because the paper claims the system generates working software, and human evaluation of code quality/correctness would be directly relevant to validating those claims. The schema says NA only 'if human evaluation is clearly irrelevant to the claims' — for a code generation system, it is clearly relevant." 36 }, 37 { 38 "category": "evaluation_design", 39 "question": "per_category_breakdown", 40 "sonnet": {"applies": false, "answer": false}, 41 "opus": {"applies": true, "answer": false}, 42 "direction": "applies_boundary", 43 "explanation": "Sonnet set applies=false because no quantitative evaluation across categories is performed. Opus sets applies=true because the paper tests on multiple tasks (to-do app, REST API, 2048 game) and could have provided per-task breakdowns of success/failure. The absence of breakdown is a methodological weakness, not structural inapplicability." 44 }, 45 { 46 "category": "contamination", 47 "question": "training_cutoff_stated", 48 "sonnet": {"applies": true, "answer": false}, 49 "opus": {"applies": false, "answer": false}, 50 "direction": "applies_boundary", 51 "explanation": "Sonnet set applies=true because GPT-4 is used and its training cutoff matters for the case study tasks. Opus sets applies=false because the schema's NA rule says 'NA if the paper does not evaluate a pre-trained model's capability on any benchmark.' The paper uses GPT-4 within a framework demonstration on self-constructed example tasks, not on a recognized benchmark. This is a genuine boundary case — the to-do app case study implicitly tests GPT-4's coding capability but is not a formal benchmark evaluation." 52 }, 53 { 54 "category": "contamination", 55 "question": "train_test_overlap_discussed", 56 "sonnet": {"applies": true, "answer": false}, 57 "opus": {"applies": false, "answer": false}, 58 "direction": "applies_boundary", 59 "explanation": "Same reasoning as training_cutoff_stated. Sonnet treated the case study as implicitly evaluating GPT-4's capability, making contamination relevant. Opus applied the schema's NA rule strictly: self-constructed example tasks are not a benchmark, so contamination questions do not apply. Both positions are defensible." 60 } 61 ], 62 "opus_checklist": { 63 "artifacts": { 64 "code_released": { 65 "applies": true, 66 "answer": false, 67 "justification": "No repository URL or code archive is provided. The paper describes the implementation and provides pseudo-code snippets and prompt excerpts, but no link to working code." 68 }, 69 "data_released": { 70 "applies": false, 71 "answer": false, 72 "justification": "This is a system architecture paper with a case study demonstration. There is no collected dataset to release." 73 }, 74 "environment_specified": { 75 "applies": true, 76 "answer": false, 77 "justification": "The paper mentions 'Python-based framework' and 'OpenAI API (GPT-4 model)' but provides no requirements.txt, Dockerfile, or version specifications for any libraries." 78 }, 79 "reproduction_instructions": { 80 "applies": true, 81 "answer": false, 82 "justification": "No step-by-step reproduction instructions. Only architectural descriptions and simplified pseudo-code (Listing 1)." 83 } 84 }, 85 "statistical_methodology": { 86 "confidence_intervals_or_error_bars": { 87 "applies": false, 88 "answer": false, 89 "justification": "The paper explicitly states 'We did not perform a rigorous quantitative evaluation here.' No quantitative metrics are reported, making confidence intervals structurally inapplicable." 90 }, 91 "significance_tests": { 92 "applies": false, 93 "answer": false, 94 "justification": "No quantitative comparisons or numeric results are reported. Significance tests are structurally inapplicable." 95 }, 96 "effect_sizes_reported": { 97 "applies": false, 98 "answer": false, 99 "justification": "No quantitative results reported at all. Effect sizes are structurally inapplicable." 100 }, 101 "sample_size_justified": { 102 "applies": false, 103 "answer": false, 104 "justification": "Single case study plus two briefly mentioned examples. No statistical sample requiring justification." 105 }, 106 "variance_reported": { 107 "applies": false, 108 "answer": false, 109 "justification": "No quantitative results reported. Variance is structurally inapplicable." 110 } 111 }, 112 "evaluation_design": { 113 "baselines_included": { 114 "applies": true, 115 "answer": false, 116 "justification": "No systematic baseline comparison. The paper anecdotally mentions that 'asking GPT-4 to do the entire to-do app in one go often resulted in missing persistence,' but this is informal and unmeasured." 117 }, 118 "baselines_contemporary": { 119 "applies": true, 120 "answer": false, 121 "justification": "The paper makes comparative claims (multi-agent vs single-agent) and cites contemporary multi-agent frameworks (ChatDev, MetaGPT) but does not include them as formal baselines. Could and should have compared against these systems." 122 }, 123 "ablation_study": { 124 "applies": true, 125 "answer": false, 126 "justification": "The system has four components (Planner, Coder, Debugger, Reviewer). No ablation study removing any component is performed." 127 }, 128 "multiple_metrics": { 129 "applies": true, 130 "answer": false, 131 "justification": "No quantitative metrics are reported at all. The paper could have measured success rates, code quality, token usage, or time, but reports only qualitative narrative from a case study." 132 }, 133 "human_evaluation": { 134 "applies": true, 135 "answer": false, 136 "justification": "No human evaluation of the system's outputs. The paper claims the system generates working software, making human evaluation of code quality directly relevant. Only informal developer observation is described." 137 }, 138 "held_out_test_set": { 139 "applies": false, 140 "answer": false, 141 "justification": "No train/test split. The paper uses self-constructed example tasks as demonstrations, not a formal evaluation on held-out data." 142 }, 143 "per_category_breakdown": { 144 "applies": true, 145 "answer": false, 146 "justification": "The paper tests on multiple tasks (to-do app, REST API, 2048 game) but provides no per-task breakdown of results. Only the to-do app is described in detail; the other two are mentioned in a single sentence." 147 }, 148 "failure_cases_discussed": { 149 "applies": true, 150 "answer": true, 151 "justification": "The Discussion and Limitations section discusses specific failure modes: incomplete plans causing missing features, error propagation, Debugger getting stuck in loops, hallucinations, and context window limitations." 152 }, 153 "negative_results_reported": { 154 "applies": true, 155 "answer": true, 156 "justification": "The paper explicitly describes failures: 'we did encounter cases where the plan was incomplete, causing the final software to lack a feature,' and discusses hallucinations, debugging loops, and inability to guarantee correctness." 157 } 158 }, 159 "claims_and_evidence": { 160 "abstract_claims_supported": { 161 "applies": true, 162 "answer": true, 163 "justification": "The abstract claims are appropriately hedged: 'We propose AgentMesh,' 'A case study illustrates,' 'We discuss... limitations.' These claims are supported by the architecture description, case study walkthrough, and limitations section." 164 }, 165 "causal_claims_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "The paper makes causal claims without controlled experiments: 'the multi-agent approach ensured that planning was done before coding (preventing the model from jumping straight into code),' 'the Debugger agent was invaluable in fixing syntax errors.' No ablation, no controlled comparison." 169 }, 170 "generalization_bounded": { 171 "applies": true, 172 "answer": false, 173 "justification": "The paper claims 'AgentMesh can handle multi-faceted tasks more robustly than a single prompt' and envisions systems that 'one day operate as a full-fledged software development team,' but only tested a single Python CLI to-do app in detail plus brief mentions of two other Python tasks. The title says 'Software Development Automation' generally." 174 }, 175 "alternative_explanations_discussed": { 176 "applies": true, 177 "answer": false, 178 "justification": "No alternative explanations considered. The benefit of multi-agent over single-agent could be due to more total tokens, different prompt engineering, API non-determinism, or task simplicity rather than the multi-agent architecture itself. None of these confounds are discussed." 179 } 180 }, 181 "setup_transparency": { 182 "model_versions_specified": { 183 "applies": true, 184 "answer": false, 185 "justification": "Specifies 'GPT-4' and mentions 'up to 8K or 32K tokens depending on version,' but does not provide an exact model version (e.g., 'gpt-4-0613') or snapshot date." 186 }, 187 "prompts_provided": { 188 "applies": true, 189 "answer": true, 190 "justification": "Actual prompt text is provided for PlannerAgent and DebuggerAgent including system and user messages in the 'Example Prompt Design' section. The case study shows what the placeholders were filled with." 191 }, 192 "hyperparameters_reported": { 193 "applies": true, 194 "answer": false, 195 "justification": "No temperature, top-p, max tokens, or other sampling parameters reported despite using the OpenAI API." 196 }, 197 "scaffolding_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Detailed description of sequential orchestration pipeline, shared project state dictionary, Debugger retry logic, artifact-centric communication, and pseudo-code orchestration loop (Listing 1)." 201 }, 202 "data_preprocessing_documented": { 203 "applies": false, 204 "answer": false, 205 "justification": "System demonstration paper with no dataset collection or preprocessing pipeline." 206 } 207 }, 208 "limitations_and_scope": { 209 "limitations_section_present": { 210 "applies": true, 211 "answer": true, 212 "justification": "Dedicated 'Discussion and Limitations' section covering six specific limitations: LLM output quality/error propagation, hallucinations, context window/scalability, lack of learning, evaluation guarantees, and domain constraints." 213 }, 214 "threats_to_validity_specific": { 215 "applies": true, 216 "answer": true, 217 "justification": "Limitations are specific to this system: 'the plan was incomplete, causing the final software to lack a feature,' 'context window (for GPT-4, can be up to 8K or 32K tokens),' 'we cannot guarantee the correctness, completeness, or security of the generated software.'" 218 }, 219 "scope_boundaries_stated": { 220 "applies": true, 221 "answer": false, 222 "justification": "The paper does not explicitly delineate what the results do NOT show. It does not state boundaries like 'we only tested Python CLI tasks' or 'we cannot claim results for other languages/domains.' Domain constraints are mentioned as a limitation but not as explicit scope boundaries." 223 } 224 }, 225 "data_integrity": { 226 "raw_data_available": { 227 "applies": true, 228 "answer": false, 229 "justification": "Case study results (generated code, agent outputs) are shown only as excerpts. No raw logs, complete generated programs, or full agent interaction traces are made available." 230 }, 231 "data_collection_described": { 232 "applies": true, 233 "answer": false, 234 "justification": "The example tasks (to-do app, REST API, 2048 game) are chosen informally with no description of how or why these specific tasks were selected." 235 }, 236 "recruitment_methods_described": { 237 "applies": false, 238 "answer": false, 239 "justification": "No human participants and no benchmark dataset. Self-generated example tasks. Recruitment is not applicable." 240 }, 241 "data_pipeline_documented": { 242 "applies": true, 243 "answer": false, 244 "justification": "No systematic documentation of how many tasks were tried, success/failure rates, or how the presented examples were selected from any larger set of attempts." 245 } 246 }, 247 "conflicts_of_interest": { 248 "funding_disclosed": { 249 "applies": true, 250 "answer": false, 251 "justification": "No funding source or acknowledgments section. No mention of grants, institutional support, or funding of any kind." 252 }, 253 "affiliations_disclosed": { 254 "applies": true, 255 "answer": true, 256 "justification": "The single author's affiliation (Toronto Metropolitan University) is clearly listed on the title page." 257 }, 258 "funder_independent_of_outcome": { 259 "applies": false, 260 "answer": false, 261 "justification": "No funding disclosed. Appears to be unfunded academic work by a single researcher." 262 }, 263 "financial_interests_declared": { 264 "applies": true, 265 "answer": false, 266 "justification": "No competing interests statement or financial disclosure present in the paper." 267 } 268 }, 269 "contamination": { 270 "training_cutoff_stated": { 271 "applies": false, 272 "answer": false, 273 "justification": "The paper does not evaluate GPT-4's capability on a recognized benchmark. It uses GPT-4 within a framework demonstration on self-constructed example tasks. Per schema NA rule: 'NA if the paper does not evaluate a pre-trained model's capability on any benchmark.'" 274 }, 275 "train_test_overlap_discussed": { 276 "applies": false, 277 "answer": false, 278 "justification": "Same reasoning as training_cutoff_stated. Self-constructed example tasks are not a benchmark, so train/test overlap is not applicable per the schema's NA rule." 279 }, 280 "benchmark_contamination_addressed": { 281 "applies": false, 282 "answer": false, 283 "justification": "No standard benchmark used. Self-constructed example tasks (to-do app, REST API, 2048 game)." 284 } 285 }, 286 "human_studies": { 287 "pre_registered": { 288 "applies": false, 289 "answer": false, 290 "justification": "No human participants involved in this study." 291 }, 292 "irb_or_ethics_approval": { 293 "applies": false, 294 "answer": false, 295 "justification": "No human participants involved in this study." 296 }, 297 "demographics_reported": { 298 "applies": false, 299 "answer": false, 300 "justification": "No human participants involved in this study." 301 }, 302 "inclusion_exclusion_criteria": { 303 "applies": false, 304 "answer": false, 305 "justification": "No human participants involved in this study." 306 }, 307 "randomization_described": { 308 "applies": false, 309 "answer": false, 310 "justification": "No human participants involved in this study." 311 }, 312 "blinding_described": { 313 "applies": false, 314 "answer": false, 315 "justification": "No human participants involved in this study." 316 }, 317 "attrition_reported": { 318 "applies": false, 319 "answer": false, 320 "justification": "No human participants involved in this study." 321 } 322 }, 323 "cost_and_practicality": { 324 "inference_cost_reported": { 325 "applies": true, 326 "answer": false, 327 "justification": "The paper discusses cost conceptually ('scales roughly with the number of subtasks times the cost of each agent's LLM calls') but provides no actual figures — no token counts, API costs, or wall-clock times." 328 }, 329 "compute_budget_stated": { 330 "applies": true, 331 "answer": false, 332 "justification": "No GPU hours, API spend, hardware specifications, or total computational cost reported." 333 } 334 } 335 } 336 }