calibration.json (15745B)
1 { 2 "paper_slug": "adaplanner-adaptive-planning-2023", 3 "total_questions": 50, 4 "agreement_count": 49, 5 "disagreement_count": 1, 6 "agreement_rate": 0.98, 7 "disagreements": [ 8 { 9 "category": "statistical_methodology", 10 "question": "effect_sizes_reported", 11 "sonnet_applies": true, 12 "sonnet_answer": false, 13 "opus_applies": true, 14 "opus_answer": true, 15 "opus_justification": "The paper reports percentage improvements with baseline context: '3.73% improvement' from 88.06% to 91.79% (ALFWorld) and '4.11% improvement' from ~88.76% to 92.87% (MiniWoB++). The schema description explicitly states that 'A paper that says 12% improvement over baseline (from 45% to 57%) provides enough context for YES.' Tables 2 and 3 provide the raw baseline scores alongside AdaPlanner scores, giving readers full effect magnitude context.", 16 "sonnet_justification": "The paper reports raw performance differences (e.g., 3.73% and 4.11% improvement) but does not report standardized effect sizes such as Cohen's d. The raw absolute differences are mentioned but framed without statistical context.", 17 "direction": "opus_generous" 18 } 19 ], 20 "opus_checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "The abstract explicitly provides a GitHub URL: 'The implementation of AdaPlanner is available on https://github.com/haotiansun14/AdaPlanner.'" 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The paper uses publicly available standard benchmarks: ALFWorld and MiniWoB++. No proprietary data was collected or modified." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "No requirements.txt, Dockerfile, conda environment, or detailed library version listing is provided. The appendix describes experimental setup at a high level but lacks sufficient detail to recreate the software environment." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "While a GitHub link is provided, the paper itself contains no step-by-step reproduction instructions, no README with commands, and no 'Reproducing Results' section." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "Tables 2 and 3 report success rates as point estimates only (e.g., '91.79%'). No confidence intervals, error bars, or ± notation appear anywhere in the paper." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper claims AdaPlanner 'outperforms state-of-the-art baselines by 3.73% and 4.11%' but provides no statistical significance tests (no p-values, t-tests, bootstrap tests, or equivalent)." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "The paper reports percentage improvements with baseline context: 3.73% improvement from 88.06% to 91.79% in ALFWorld and 4.11% improvement in MiniWoB++. Tables 2 and 3 provide the raw baseline scores alongside AdaPlanner scores. Per the schema example, percentage improvement with baseline context counts as YES." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "The 134 ALFWorld tasks and 53 MiniWoB++ tasks are described but there is no justification for why these sample sizes are sufficient and no power analysis is provided." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "All results in Tables 2 and 3 are single point estimates. No mention of multiple runs, standard deviation, IQR, or any spread measure. Result stability is impossible to assess." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "The paper compares against multiple baselines: BUTLER, ReAct, Reflexion (ALFWorld) and CC-Net, WGE, WebN-T5-3B, RCI (MiniWoB++), covering both training-based and prompting-based methods." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "Baselines include contemporaneous work from 2023 (Reflexion, RCI, ReAct) alongside older training-based methods, representing reasonable state of the art." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Figure 4c ablates the code interface (CI) and Figure 4d ablates skill discovery (SD), each showing the contribution of individual components via controlled removal." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": false, 89 "justification": "Only success rate (%) is used. Section 8.1 states 'we use success rate (%) to evaluate the performance of tested methods' and no additional metrics are reported." 90 }, 91 "human_evaluation": { 92 "applies": false, 93 "answer": false, 94 "justification": "Evaluation is entirely automated via task success/failure in simulated environments (ALFWorld, MiniWoB++). Human evaluation is irrelevant to these automated benchmark claims." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "Results are reported on standard benchmark test sets (134 ALFWorld tasks, 53 MiniWoB++ tasks). Few-shot demonstrations used for prompting are separate from evaluated tasks." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Table 2 provides per-task-type results for all six ALFWorld types. Table 3 separates MiniWoB++ into feedback/no-feedback subsets. Appendix Table 5 gives per-task results for all 53 MiniWoB++ tasks." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": false, 109 "justification": "Appendix 8.4 shows trajectory examples where baselines fail due to hallucination, but there is no error analysis or discussion of where AdaPlanner itself breaks down. The 47.06% success on 'Pick two' is not analyzed. No systematic failure case discussion for the proposed method." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Ablation studies (Figures 4c, 4d) show performance drops when components are removed. Table 2 shows gpt-3.5-turbo underperforms text-davinci-002, which is explicitly discussed in Section 4 as an unexpected finding." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "Abstract claims of 3.73% and 4.11% improvement with 2x and 600x fewer samples are supported by Tables 2, 3, and Figure 3." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "Causal claims via ablation studies (Figures 4c, 4d) are supported by controlled single-variable manipulation — removing code interface or skill discovery individually and measuring the impact." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The paper tests on two specific text-based simulation environments but uses broad language like 'sequential decision-making tasks' without bounding claims to these specific environments. The title and framing suggest general applicability." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "No alternative explanations for results are discussed. For instance, performance differences could stem from prompt engineering quality rather than the closed-loop mechanism, but this is not addressed." 137 } 138 }, 139 "setup_transparency": { 140 "model_versions_specified": { 141 "applies": true, 142 "answer": true, 143 "justification": "The paper specifies 'gpt-3.5-turbo', 'text-davinci-002', and 'text-davinci-003' as specific model API identifiers in Tables 2 and 3 and Section 8.2. text-davinci-002 and text-davinci-003 are fixed model versions." 144 }, 145 "prompts_provided": { 146 "applies": true, 147 "answer": true, 148 "justification": "Appendix 8.3 provides full prompt text for all stages (basic_info, initial_planning, code_check, refinement, start_from) with actual expert trajectory examples for all six ALFWorld task types and MiniWoB++ tasks." 149 }, 150 "hyperparameters_reported": { 151 "applies": true, 152 "answer": false, 153 "justification": "No temperature, top-p, max tokens, or other LLM sampling hyperparameters are reported anywhere in the paper or appendix." 154 }, 155 "scaffolding_described": { 156 "applies": true, 157 "answer": true, 158 "justification": "The agentic scaffolding is described in detail: planner/refiner dual roles, in-plan and out-of-plan refinement, ask_LLM() action, assertion-based error detection, refine-then-resume mechanism, and skill discovery memory are all described in Sections 3.1-3.3." 159 }, 160 "data_preprocessing_documented": { 161 "applies": true, 162 "answer": true, 163 "justification": "Section 8.1 describes the environments, task selection (9 MiniWoB++ tasks with feedback, 53 tasks from RCI), and demonstration allocation (Table 4). The skill discovery pipeline is described in Section 3.3." 164 } 165 }, 166 "limitations_and_scope": { 167 "limitations_section_present": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 6 'Conclusion and Limitations' includes a dedicated limitations discussion noting that AdaPlanner still requires few-shot expert demonstrations for complex tasks." 171 }, 172 "threats_to_validity_specific": { 173 "applies": true, 174 "answer": false, 175 "justification": "The limitations section mentions only one limitation (need for few-shot demonstrations). No specific threats to validity are discussed such as benchmark contamination, sensitivity to prompt phrasing, or the limited scope of the two test environments." 176 }, 177 "scope_boundaries_stated": { 178 "applies": true, 179 "answer": false, 180 "justification": "The paper does not explicitly state what the results do NOT show. No bounding of results to text-based environments or these specific benchmarks. Broader applicability is implied without explicit scope boundaries." 181 } 182 }, 183 "data_integrity": { 184 "raw_data_available": { 185 "applies": true, 186 "answer": false, 187 "justification": "No raw experimental logs, trajectories, or per-episode results are provided for independent verification. Only aggregate success rates are reported." 188 }, 189 "data_collection_described": { 190 "applies": true, 191 "answer": true, 192 "justification": "Section 8.1 describes the environments, task types, and how demonstrations were collected (38 human-written + 21 discovered by skill discovery for MiniWoB++, 6 expert samples for ALFWorld)." 193 }, 194 "recruitment_methods_described": { 195 "applies": false, 196 "answer": false, 197 "justification": "No human participants are involved. The evaluation uses automated environments with no human subjects. Standard benchmark data source makes this NA." 198 }, 199 "data_pipeline_documented": { 200 "applies": true, 201 "answer": true, 202 "justification": "The evaluation pipeline is documented: how tasks are selected, how episodes are run, how success/failure is determined (Section 8.1), and how skill discovery produces additional examples (Section 3.3)." 203 } 204 }, 205 "conflicts_of_interest": { 206 "funding_disclosed": { 207 "applies": true, 208 "answer": false, 209 "justification": "No acknowledgments section or funding disclosure is present anywhere in the paper." 210 }, 211 "affiliations_disclosed": { 212 "applies": true, 213 "answer": true, 214 "justification": "All five authors are affiliated with Georgia Institute of Technology, as clearly stated on the first page. No authors appear to be affiliated with OpenAI or other evaluated model providers." 215 }, 216 "funder_independent_of_outcome": { 217 "applies": false, 218 "answer": false, 219 "justification": "No funding is disclosed. The schema indicates NA if unfunded. All authors are university researchers with no disclosed external funding." 220 }, 221 "financial_interests_declared": { 222 "applies": true, 223 "answer": false, 224 "justification": "There is no competing interests or financial disclosure statement in the paper. Absence of disclosure is not absence of conflict." 225 } 226 }, 227 "contamination": { 228 "training_cutoff_stated": { 229 "applies": true, 230 "answer": false, 231 "justification": "The paper uses GPT-3 (text-davinci-002), GPT-3.5 (text-davinci-003, gpt-3.5-turbo) but does not state the training data cutoff dates for any of these models." 232 }, 233 "train_test_overlap_discussed": { 234 "applies": true, 235 "answer": false, 236 "justification": "No discussion of whether ALFWorld or MiniWoB++ benchmarks or their task descriptions may have appeared in the GPT-3/3.5 training data." 237 }, 238 "benchmark_contamination_addressed": { 239 "applies": true, 240 "answer": false, 241 "justification": "ALFWorld (2021) and MiniWoB++ (2018) were both published before GPT-3/3.5 training data cutoffs. The paper does not address the possibility that task descriptions or solutions appeared in training data." 242 } 243 }, 244 "human_studies": { 245 "pre_registered": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants are involved in this study." 249 }, 250 "irb_or_ethics_approval": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants are involved in this study." 254 }, 255 "demographics_reported": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants are involved in this study." 259 }, 260 "inclusion_exclusion_criteria": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants are involved in this study." 264 }, 265 "randomization_described": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants are involved in this study." 269 }, 270 "blinding_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants are involved in this study." 274 }, 275 "attrition_reported": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants are involved in this study." 279 } 280 }, 281 "cost_and_practicality": { 282 "inference_cost_reported": { 283 "applies": true, 284 "answer": false, 285 "justification": "No API costs, token counts, or wall-clock time per episode are reported. The paper mentions reducing LLM calls as a benefit but does not quantify actual inference costs." 286 }, 287 "compute_budget_stated": { 288 "applies": true, 289 "answer": false, 290 "justification": "No total compute budget, GPU hours, or API spend is reported. The computational resources required to run the experiments are not quantified." 291 } 292 } 293 } 294 }