calibration.json (17500B)
1 { 2 "calibration": { 3 "paper_slug": "agentsllm-augmentative-generation-2025", 4 "calibration_date": "2026-02-28", 5 "sonnet_scan_date": "2026-02-28", 6 "agreement_rate": 0.98, 7 "total_questions": 50, 8 "agreements": 49, 9 "disagreements": 1, 10 "disagreement_details": [ 11 { 12 "category": "setup_transparency", 13 "question": "prompts_provided", 14 "sonnet": { "applies": true, "answer": true }, 15 "opus": { "applies": true, "answer": false }, 16 "direction": "sonnet_generous", 17 "explanation": "Sonnet credits Figure 2 as providing sufficient prompt text for the SMA. Opus notes that only the SMA prompt is shown in full; the QA agents' prompts (Text QA Agent, Visual QA Engineer, Visual QA Agent) are described only in natural language without verbatim text. The schema requires that 'the reader must be able to reconstruct every prompt sent to the model,' which is not possible for the tQA and vQA variants. Additionally, the 'list of common problems compiled from typical mistakes observed during initial experimentation' given to the Text QA Agent is referenced but never provided. Opus answer: false." 18 } 19 ], 20 "opus_checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "The conclusion states 'The code for this paper will be made available,' which is a promise of future release. A GitHub link is in the header but the conclusion confirms code is not yet available. Per schema, a promise counts as NO." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The paper uses publicly available interPlan scenarios [9] and the nuPlan benchmark [2]. These are standard public benchmarks the authors did not modify. No proprietary data was collected." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "No requirements.txt, Dockerfile, conda environment file, or library version listing is provided. Only LLM model names are mentioned without software dependency details." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The method description is high-level and insufficient for a competent researcher to reproduce without guessing." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": true, 47 "justification": "Table IV reports 95% confidence intervals via bootstrapping for Elo ratings for all nine variants (e.g., interPlan: 1042, CI -9/+11). CIs are not provided for displacement error, but the Elo CIs satisfy the criterion." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "No formal significance tests (p-values, t-tests, etc.) are used. Claims like 'significantly larger error' for LCTGen and 'significantly fewer errors' for GPT-4o are made without statistical testing. The Elo CI-based ranking is not a formal hypothesis test." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "No formal effect sizes (Cohen's d, odds ratios, etc.) are computed. Raw displacement errors and Elo point differences are reported but without standardized effect size measures. Table V provides driving scores with baseline context but no formal effect size metric." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "The 50-scenario test set and 9 expert judges are not justified with power analysis or explicit reasoning. No acknowledgment that these numbers may be insufficient for certain claims." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "Figures 3 and 4 show box plots (distributions across scenarios) but these are not variance across experimental runs — each configuration appears to be run once per scenario. Table V reports only mean driving scores with no variance. No standard deviation or IQR is reported in tabular form." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "LCTGen [23] serves as an external baseline for displacement error (Figure 3). Human-generated interPlan scenarios serve as ground truth in the Elo comparison (Table IV)." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "LCTGen (CoRL 2023) and interPlan (IROS 2024) are both recent and appropriate for the task. No suspiciously old or weak baselines." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Table II and Section V.B present four variants (OTM, FC, tQA, vQA) that systematically vary components: lane representation and QA loop type. This constitutes an ablation over framework components." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Three metrics: (1) displacement error in meters, (2) Elo rating from human pairwise comparisons, (3) nuPlan mean driving score." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": true, 94 "justification": "Nine domain experts conducted 5760 pairwise comparisons of BEV images in a blind setup with model identities hidden. Elo ratings with 95% CIs computed." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": false, 99 "justification": "The same 50 interPlan scenarios are used for both development ('ground-truth for development') and final evaluation. No held-out split is described." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Figure 3 shows displacement error by five scenario types. Table III shows error counts by three error categories (position, heading, logic) per model." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section V.A defines three error categories with Table III quantifying them. Figure 5 provides qualitative failure examples (e.g., 'Gemini placed the accident vehicles with unrealistically large overlap')." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Multiple negative results: vQA does not improve displacement error over tQA (Section V.B); smaller models significantly underperform with OTM; LCTGen fails entirely on two scenario types." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "Abstract claims about comparable quality to manual scenarios are supported by Table IV (GPT-4o OTM Elo 1039 vs. interPlan 1042, overlapping CIs, both rank 1). The claim about smaller models is supported by Gemini vQA reaching rank 1." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "Causal claims about the agentic framework enabling smaller models are supported by controlled ablation experiments (Table II, Figures 3-4) that vary prompting strategy while holding the model constant, allowing causal attribution." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "Tested on only 50 scenarios from one dataset (interPlan) covering five types on nuPlan. The title and abstract make broad claims about 'autonomous driving planners' and 'LLM-agent based framework' without bounding results to this specific setting." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "No alternative explanations are discussed. The discrepancy between vQA improving Elo but not displacement error is not explained (could be evaluator visual bias). No confounds or robustness checks are discussed." 137 } 138 }, 139 "setup_transparency": { 140 "model_versions_specified": { 141 "applies": true, 142 "answer": false, 143 "justification": "Uses 'GPT-4o', 'Gemini-1.5-Flash', 'Llama3.1-70B' without API version numbers or snapshot dates. Per schema, marketing names without snapshot dates do not count." 144 }, 145 "prompts_provided": { 146 "applies": true, 147 "answer": false, 148 "justification": "Figure 2 shows the SMA prompt with actual values for one example, but the QA agents' prompts (Text QA, Visual QA Engineer, Visual QA Agent) are only described in natural language. The 'list of common problems' given to the Text QA Agent is referenced but not provided. The reader cannot reconstruct all prompts used across tQA and vQA variants." 149 }, 150 "hyperparameters_reported": { 151 "applies": true, 152 "answer": false, 153 "justification": "No LLM hyperparameters (temperature, top-p, max tokens) reported. The paper states models are used 'without any problem-specific fine-tuning' but does not report inference sampling settings." 154 }, 155 "scaffolding_described": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section III describes the agentic scaffold in detail: SMA, Text QA Agent, Visual QA Engineer, Visual QA Agent with their roles, inputs, outputs, rating mechanism (1-5 scale), feedback loops, and retry logic. Figure 1 provides a workflow diagram." 159 }, 160 "data_preprocessing_documented": { 161 "applies": true, 162 "answer": true, 163 "justification": "Section III.C and Table I document the text-based scenario representation (entity types, attributes, vector format). The transformation from raw interPlan scenarios to input vectors is documented." 164 } 165 }, 166 "limitations_and_scope": { 167 "limitations_section_present": { 168 "applies": true, 169 "answer": false, 170 "justification": "No dedicated limitations section. One sentence in the conclusion: 'One limitation is the dependence on commercial frontier LLMs.' Per schema, a single sentence in the conclusion does not count as substantive discussion." 171 }, 172 "threats_to_validity_specific": { 173 "applies": true, 174 "answer": false, 175 "justification": "No threats to validity discussed. The one mentioned limitation (API dependence) is a practical concern, not a validity threat. Specific threats (small test set, evaluator bias, no held-out split) are not discussed." 176 }, 177 "scope_boundaries_stated": { 178 "applies": true, 179 "answer": false, 180 "justification": "No explicit out-of-scope statements. Results are not bounded to the five interPlan scenario types, the nuPlan map format, or the specific LLMs tested." 181 } 182 }, 183 "data_integrity": { 184 "raw_data_available": { 185 "applies": true, 186 "answer": false, 187 "justification": "Raw generated scenarios and expert judgment data (5760 pairwise comparisons) are not released. The underlying data cannot be independently verified." 188 }, 189 "data_collection_described": { 190 "applies": true, 191 "answer": true, 192 "justification": "Section IV describes the 50 interPlan scenarios across five types, how they are used for development and evaluation, and how expert pairwise comparisons were collected." 193 }, 194 "recruitment_methods_described": { 195 "applies": true, 196 "answer": false, 197 "justification": "Nine expert judges described only as 'experts from the autonomous driving research community.' No description of how they were recruited, what criteria defined 'expert,' or whether they were internal colleagues or external." 198 }, 199 "data_pipeline_documented": { 200 "applies": true, 201 "answer": true, 202 "justification": "The pipeline is documented: interPlan scenarios → text-based representation → LLM modification → optional QA loop → nuPlan simulation → evaluation. Pairwise comparison → Elo computation is also described." 203 } 204 }, 205 "conflicts_of_interest": { 206 "funding_disclosed": { 207 "applies": true, 208 "answer": false, 209 "justification": "No acknowledgments section or funding disclosure. Multiple authors are from Robert Bosch GmbH, a major industry player, but no explicit funding statement is made." 210 }, 211 "affiliations_disclosed": { 212 "applies": true, 213 "answer": true, 214 "justification": "Author affiliations clearly listed: Robert Bosch GmbH, FAU Erlangen-Nürnberg, University of Toronto, ScaDS.AI/TU Dresden, University of Tübingen, Vector Institute." 215 }, 216 "funder_independent_of_outcome": { 217 "applies": true, 218 "answer": false, 219 "justification": "Multiple authors affiliated with Robert Bosch GmbH, which has commercial interests in autonomous driving testing. The employer has a direct stake in demonstrating the framework's effectiveness." 220 }, 221 "financial_interests_declared": { 222 "applies": true, 223 "answer": false, 224 "justification": "No competing interests statement or financial interest declaration is present in the paper." 225 } 226 }, 227 "contamination": { 228 "training_cutoff_stated": { 229 "applies": false, 230 "answer": false, 231 "justification": "The paper uses LLMs for generative scenario creation (instruction-following), not for benchmark knowledge retrieval. The LLMs are not being evaluated on what they know from training data. Contamination is structurally inapplicable." 232 }, 233 "train_test_overlap_discussed": { 234 "applies": false, 235 "answer": false, 236 "justification": "Not applicable — the task is generative scenario augmentation from user instructions, not benchmark knowledge retrieval. No meaningful train/test contamination concern." 237 }, 238 "benchmark_contamination_addressed": { 239 "applies": false, 240 "answer": false, 241 "justification": "Not applicable — the paper does not evaluate pre-trained model knowledge on a benchmark. nuPlan and interPlan are used to evaluate driving planners, not LLM knowledge." 242 } 243 }, 244 "human_studies": { 245 "pre_registered": { 246 "applies": true, 247 "answer": false, 248 "justification": "No pre-registration link provided (no OSF, AsPredicted, or other registry). The human expert evaluation study is not pre-registered." 249 }, 250 "irb_or_ethics_approval": { 251 "applies": true, 252 "answer": false, 253 "justification": "No mention of IRB or ethics board approval despite involving nine human participants in an evaluation study." 254 }, 255 "demographics_reported": { 256 "applies": true, 257 "answer": false, 258 "justification": "Expert judges described only as 'experts from the autonomous driving research community.' No demographics (experience level, years, gender, geographic distribution) reported." 259 }, 260 "inclusion_exclusion_criteria": { 261 "applies": true, 262 "answer": false, 263 "justification": "No inclusion/exclusion criteria for expert judges stated beyond 'experts from the autonomous driving research community.' No formal screening or expertise definition." 264 }, 265 "randomization_described": { 266 "applies": true, 267 "answer": true, 268 "justification": "Section IV states 'The order of match-ups were randomized,' indicating randomization of pairwise comparison order to prevent ordering effects." 269 }, 270 "blinding_described": { 271 "applies": true, 272 "answer": true, 273 "justification": "Section IV states 'the identities of the models were hidden from the judges during rating,' confirming evaluators were blinded to model identity." 274 }, 275 "attrition_reported": { 276 "applies": true, 277 "answer": false, 278 "justification": "No attrition information. 5760 comparisons across 9 judges stated but no per-judge breakdown or dropout information provided." 279 } 280 }, 281 "cost_and_practicality": { 282 "inference_cost_reported": { 283 "applies": true, 284 "answer": false, 285 "justification": "Cost is discussed qualitatively ('very cost inefficient' for vQA with frontier models) but no actual figures (API spend, token counts, cost per scenario) are reported despite cost reduction being a stated motivation." 286 }, 287 "compute_budget_stated": { 288 "applies": true, 289 "answer": false, 290 "justification": "No total computational budget stated. No GPU hours, API spend, or hardware specifications provided." 291 } 292 } 293 } 294 } 295 }