calibration.json (19981B)
1 { 2 "paper_slug": "adaptrack-constrained-decoding-2025", 3 "calibration_date": "2026-02-28", 4 "calibration_model": "opus", 5 "scan_model": "sonnet", 6 "total_questions": 50, 7 "agreement_count": 46, 8 "disagreement_count": 4, 9 "agreement_rate": 0.92, 10 "disagreements": [ 11 { 12 "category": "artifacts", 13 "question": "code_released", 14 "sonnet": {"applies": true, "answer": true}, 15 "opus": {"applies": true, "answer": false}, 16 "direction": "sonnet_generous", 17 "explanation": "The paper states 'We open-source the modified package together with our source code' (Section 6.3) but provides NO repository URL or archive link anywhere in the paper text. The schema explicitly states: 'Only YES if a working URL or archive is provided.' A promise or claim of release without a verifiable link does not meet this criterion." 18 }, 19 { 20 "category": "artifacts", 21 "question": "data_released", 22 "sonnet": {"applies": true, "answer": true}, 23 "opus": {"applies": true, "answer": false}, 24 "direction": "sonnet_generous", 25 "explanation": "While HumanEval and MBPP are publicly available benchmarks, the paper's primary novel datasets (TFv1 and TFv1Real) are constructed by the authors and not released with any download link or URL. The open-source claim has no accompanying URL. The schema requires actual availability, not claims of future release alongside code." 26 }, 27 { 28 "category": "artifacts", 29 "question": "environment_specified", 30 "sonnet": {"applies": true, "answer": true}, 31 "opus": {"applies": true, "answer": false}, 32 "direction": "sonnet_generous", 33 "explanation": "The paper names libraries (PyTorch, Transformers, transformers-CFG) and hardware (64-core Intel Xeon, 8 NVIDIA RTX A6000 GPUs) but provides NO library version numbers, no requirements.txt, no Dockerfile, and no detailed environment setup section. The schema says 'Mentioning Python 3.x alone is NOT enough — there must be enough detail to recreate the environment.' Similarly, naming libraries without versions is insufficient to recreate the environment." 34 }, 35 { 36 "category": "claims_and_evidence", 37 "question": "generalization_bounded", 38 "sonnet": {"applies": true, "answer": true}, 39 "opus": {"applies": true, "answer": false}, 40 "direction": "sonnet_generous", 41 "explanation": "The paper's title 'Constrained Decoding without Distorting LLM's Output Intent' and abstract discuss 'language model-based code generation' broadly, but experiments are only on 7B base models in Python and TypeScript. The schema says: 'If the paper tests on Python and claims results for code generation generally, NO.' The threats to validity section (7.3) addresses some concerns but does not explicitly bound generalization to the tested settings (base models only, Python/TypeScript only, 7B-scale primarily). Instruction-tuned models, other languages, and other constraint types are not discussed as scope boundaries." 42 } 43 ], 44 "opus_checklist": { 45 "artifacts": { 46 "code_released": { 47 "applies": true, 48 "answer": false, 49 "justification": "Section 6.3 states 'We open-source the modified package together with our source code' but NO repository URL, GitHub link, or archive link is provided anywhere in the paper. The schema requires 'a working URL or archive.' A claim of open-source release without a verifiable link is NO." 50 }, 51 "data_released": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper's primary novel datasets (TFv1 with 419 APIs, TFv1Real with 1,000 sampled files) are constructed by the authors but no download link or URL is provided. While HumanEval and MBPP are publicly available, the key custom datasets that represent the paper's contribution are not verifiably released." 55 }, 56 "environment_specified": { 57 "applies": true, 58 "answer": false, 59 "justification": "The paper names libraries (PyTorch, Transformers, transformers-CFG) and hardware (64-core Intel Xeon, 8 NVIDIA RTX A6000 GPUs) but provides no library version numbers, no requirements.txt, no Dockerfile, and no environment setup section with specific dependency versions. This is insufficient to recreate the experimental environment." 60 }, 61 "reproduction_instructions": { 62 "applies": true, 63 "answer": false, 64 "justification": "No step-by-step reproduction instructions, README with commands, or reproduction scripts are described in the paper." 65 } 66 }, 67 "statistical_methodology": { 68 "confidence_intervals_or_error_bars": { 69 "applies": true, 70 "answer": false, 71 "justification": "All results in Tables 1-4 are point estimates only. No confidence intervals, error bars, or standard deviations are reported for any performance metric." 72 }, 73 "significance_tests": { 74 "applies": true, 75 "answer": false, 76 "justification": "The paper makes comparative claims (e.g., 'AdapTrack performs significantly better') but provides no statistical significance tests — no p-values, t-tests, or similar. All comparisons are raw point estimate comparisons." 77 }, 78 "effect_sizes_reported": { 79 "applies": true, 80 "answer": true, 81 "justification": "The paper reports percentage improvements with baseline context, e.g., 'improvement of 360.87% compared to constrained decoding' with absolute numbers (from 10.98% to 50.60%). This provides sufficient context to assess effect magnitude." 82 }, 83 "sample_size_justified": { 84 "applies": true, 85 "answer": false, 86 "justification": "No justification for sample sizes: 419 APIs in TFv1, 1,000 files in TFv1Real (from 12,883 available), 20 random samples per problem, 2000 samples for DSL. No power analysis or reasoning for these choices." 87 }, 88 "variance_reported": { 89 "applies": true, 90 "answer": false, 91 "justification": "All performance tables (1-4) report single-point results with no standard deviation, variance, or spread measures across runs. Table 5 reports avg/min/max for LM call counts but not for the main performance metrics." 92 } 93 }, 94 "evaluation_design": { 95 "baselines_included": { 96 "applies": true, 97 "answer": true, 98 "justification": "The paper compares AdapTrack against unconstrained decoding, constrained decoding (Sections 6.1-6.3), and ASAp (Section 6.4). Multiple appropriate baselines are included." 99 }, 100 "baselines_contemporary": { 101 "applies": true, 102 "answer": true, 103 "justification": "ASAp (Park et al., 2024) is a recent competing method. Unconstrained and constrained decoding are appropriate fundamental baselines for this problem. The paper acknowledges ASAp as the closest prior work." 104 }, 105 "ablation_study": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 7.2.2 and Figure 7 conduct experiments with limited backtracking distances (0, 1, 2, 4, 8), isolating the key component's contribution. Robustness experiments on temperature (Section 7.1.1) and model size (Section 7.1.2) provide additional component analysis." 109 }, 110 "multiple_metrics": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper uses EM@k (exact match at k=1,3,5,10,20), pass@k for general code generation, and KL divergence for distribution alignment — at least three distinct metric types." 114 }, 115 "human_evaluation": { 116 "applies": false, 117 "answer": false, 118 "justification": "This is a decoding algorithm evaluated on automated code correctness metrics (exact match, pass@k, KL divergence). Human evaluation is not relevant to claims about distribution alignment and constraint compliance." 119 }, 120 "held_out_test_set": { 121 "applies": true, 122 "answer": true, 123 "justification": "HumanEval and MBPP are standard held-out benchmarks. TFv1 is a synthetic evaluation dataset not used for tuning. TFv1Real was randomly sampled from 12,883 files for evaluation only. No evidence of data leakage between tuning and evaluation." 124 }, 125 "per_category_breakdown": { 126 "applies": true, 127 "answer": true, 128 "justification": "Results are broken down by model (4 code LLMs), by dataset (TFv1 v1/v2, TFv1Real, HumanEval, MBPP), and by DSL subdataset (SLIA, INV-BV, CP, binary in Table 4). Individual model performance is visible in all tables." 129 }, 130 "failure_cases_discussed": { 131 "applies": true, 132 "answer": true, 133 "justification": "Section 6.3 discusses cases where AdapTrack performs worse than unconstrained decoding (pass@1 for StarCoder2 7B and CodeLlama 7B on MBPP), attributing it to the constrainer's limitation. Section 6.4 discusses INV-BV where all methods perform similarly." 134 }, 135 "negative_results_reported": { 136 "applies": true, 137 "answer": true, 138 "justification": "The paper explicitly reports that AdapTrack sometimes performs worse than unconstrained decoding (Table 3: pass@1 for StarCoder2 7B and CodeLlama 7B on MBPP), and notes the outlier in pass@20 of CodeLlama 7B on HumanEval." 139 } 140 }, 141 "claims_and_evidence": { 142 "abstract_claims_supported": { 143 "applies": true, 144 "answer": true, 145 "justification": "Abstract claims of 360.87% improvement on TFv1, 38.93% on TFv1Real, 7.84% on HumanEval, and 6.42% on MBPP are confirmed in Tables 1, 2, and 3. The theoretical proof is in Section 5." 146 }, 147 "causal_claims_justified": { 148 "applies": true, 149 "answer": true, 150 "justification": "The paper's causal claim that AdapTrack improves performance by avoiding distribution distortion is supported by: (1) mathematical proof in Section 5, (2) controlled experiments where only the decoding method varies, (3) KL divergence experiments (RQ4) demonstrating distribution alignment. These constitute adequate causal evidence." 151 }, 152 "generalization_bounded": { 153 "applies": true, 154 "answer": false, 155 "justification": "The title 'Constrained Decoding without Distorting LLM's Output Intent' and abstract frame the contribution broadly for 'language model-based code generation.' However, experiments are conducted only on 7B base models (with 0.5B-32B robustness check), only in Python and TypeScript, and only with specific constrainer types. The paper does not explicitly bound its claims to these settings. The schema says 'If the paper tests on Python and claims results for code generation generally, NO.'" 156 }, 157 "alternative_explanations_discussed": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 7.3 discusses two specific alternative explanations: (1) whether the problem only manifests with deprecated APIs (7.3.1), mitigated by TFv1 v1-setting and RQ3 experiments; (2) whether the problem is model-capability-dependent (7.3.2), mitigated by multi-model experiments. These are specific to this study." 161 } 162 }, 163 "setup_transparency": { 164 "model_versions_specified": { 165 "applies": true, 166 "answer": true, 167 "justification": "Specific model names are given: Qwen2.5 Coder 7B, DeepSeek Coder Base 6.7B, StarCoder2 7B, CodeLlama Python 7B, Mistral-7B. These are open-weight models whose names uniquely identify specific model checkpoints on HuggingFace, unlike API-based models that change over time." 168 }, 169 "prompts_provided": { 170 "applies": true, 171 "answer": false, 172 "justification": "Figure 3 shows an example code context format for TFv1, but actual prompts for all problems are not provided. For RQ4, prompts contain 'a grammar and specification' with '3 in-context examples' but the actual prompt text is not shown. HumanEval/MBPP use standard prompts from MultiPL-E but the paper does not include them." 173 }, 174 "hyperparameters_reported": { 175 "applies": true, 176 "answer": true, 177 "justification": "Temperature specified as 1 for main experiments (Section 7.1.1). Temperature range tested (0.1-1.0). Top-p variant for RQ3 described. 60-second timeout per token validity check specified. 20 samples per problem for EM@k. 2000 samples for DSL experiments." 178 }, 179 "scaffolding_described": { 180 "applies": false, 181 "answer": false, 182 "justification": "AdapTrack is a decoding algorithm operating at the token generation level. There is no agentic scaffolding (no tool use, memory management, or agent loops)." 183 }, 184 "data_preprocessing_documented": { 185 "applies": true, 186 "answer": true, 187 "justification": "Section 6.1 describes TFv1 construction: collected all TensorFlow 2.16 APIs, filtered for tf.compat.v1 prefix, excluded APIs with v2 counterparts, retained only lowercase-starting APIs, removed prefix-able APIs, yielding 419 APIs. Section 6.2 describes TFv1Real: 46,785 files → deduplicated/split → removed prefix files → removed non-API suffixes → 14,237 → filtered >3,584 tokens → 12,883 → randomly sampled 1,000." 188 } 189 }, 190 "limitations_and_scope": { 191 "limitations_section_present": { 192 "applies": true, 193 "answer": true, 194 "justification": "Section 7.3 'Threats to validity' provides a dedicated subsection with two specific subsections (7.3.1 Dataset selection, 7.3.2 Model selection) containing substantive discussion." 195 }, 196 "threats_to_validity_specific": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 7.3.1 specifically addresses whether the distortion problem only occurs with deprecated APIs. Section 7.3.2 addresses whether the improvement is model-capability-specific. Both are specific to this study and mitigated with additional experiments." 200 }, 201 "scope_boundaries_stated": { 202 "applies": true, 203 "answer": false, 204 "justification": "The paper does not explicitly state what the results do NOT show. No discussion of whether results transfer to instruction-tuned models (only base models tested), to languages beyond Python/TypeScript, to other constraint types, or to production settings. The threats section focuses on rebutting concerns rather than explicitly bounding scope." 205 } 206 }, 207 "data_integrity": { 208 "raw_data_available": { 209 "applies": true, 210 "answer": false, 211 "justification": "The paper claims open-source release but provides no URL. Neither TFv1 nor TFv1Real datasets are available for independent verification from the paper alone. Raw experimental outputs are not provided." 212 }, 213 "data_collection_described": { 214 "applies": true, 215 "answer": true, 216 "justification": "TFv1 construction: collected legal TensorFlow 2.16 APIs from official website, applied explicit filtering criteria, yielding 419 APIs (Section 6.1). TFv1Real: collected 46,785 Python files from GitHub using tf.compat.v1 API search, applied documented filtering pipeline (Section 6.2)." 217 }, 218 "recruitment_methods_described": { 219 "applies": false, 220 "answer": false, 221 "justification": "No human participants. Data sourced from GitHub repositories (public data) and official TensorFlow API documentation." 222 }, 223 "data_pipeline_documented": { 224 "applies": true, 225 "answer": true, 226 "justification": "Full pipeline documented for both datasets with explicit counts at each stage: TFv1 (all TF 2.16 APIs → filter compat.v1 → exclude v2 counterparts → lowercase filter → prefix filter → 419). TFv1Real (46,785 → dedup/split → 14,237 → 12,883 → 1,000)." 227 } 228 }, 229 "conflicts_of_interest": { 230 "funding_disclosed": { 231 "applies": true, 232 "answer": true, 233 "justification": "Acknowledgments section lists: National Key R&D Program (2023YFB4503801), NSFC grants (62192733, 62192730, 62192731), and Hubei Province Major Program (2023BAA024)." 234 }, 235 "affiliations_disclosed": { 236 "applies": true, 237 "answer": true, 238 "justification": "Author affiliations clearly listed: Yongmin Li, Ge Li, and Zhi Jin at Peking University (Key Lab of High Confidence Software Technology); Jia Li at Tsinghua University (College of AI)." 239 }, 240 "funder_independent_of_outcome": { 241 "applies": true, 242 "answer": true, 243 "justification": "Funding from Chinese government research programs (NSFC, National Key R&D Program, Hubei Province) which have no financial stake in whether AdapTrack outperforms constrained decoding." 244 }, 245 "financial_interests_declared": { 246 "applies": true, 247 "answer": false, 248 "justification": "No competing interests or financial interests statement is present anywhere in the paper. The schema states: 'If there is no competing interests statement at all, NO — absence of disclosure is not the same as absence of conflict.'" 249 } 250 }, 251 "contamination": { 252 "training_cutoff_stated": { 253 "applies": true, 254 "answer": false, 255 "justification": "The paper uses HumanEval and MBPP benchmarks with LLMs (Qwen2.5 Coder, DeepSeek Coder, StarCoder2, CodeLlama, Mistral) but does not state the training data cutoff date for any of these models." 256 }, 257 "train_test_overlap_discussed": { 258 "applies": true, 259 "answer": false, 260 "justification": "HumanEval (2021) and MBPP (2021) are used with models trained after these benchmarks were published. No discussion of potential overlap between training data and benchmark examples." 261 }, 262 "benchmark_contamination_addressed": { 263 "applies": true, 264 "answer": false, 265 "justification": "HumanEval and MBPP were both published in 2021. All models used were trained well after 2021 and may have seen these benchmarks in training. The paper does not address this contamination risk, which could inflate baseline numbers and affect relative improvement measurements." 266 } 267 }, 268 "human_studies": { 269 "pre_registered": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants. This is a benchmark evaluation of a decoding algorithm." 273 }, 274 "irb_or_ethics_approval": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants. IRB approval not applicable." 278 }, 279 "demographics_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study." 283 }, 284 "inclusion_exclusion_criteria": { 285 "applies": false, 286 "answer": false, 287 "justification": "No human participants in this study." 288 }, 289 "randomization_described": { 290 "applies": false, 291 "answer": false, 292 "justification": "No human participants in this study." 293 }, 294 "blinding_described": { 295 "applies": false, 296 "answer": false, 297 "justification": "No human participants in this study." 298 }, 299 "attrition_reported": { 300 "applies": false, 301 "answer": false, 302 "justification": "No human participants in this study." 303 } 304 }, 305 "cost_and_practicality": { 306 "inference_cost_reported": { 307 "applies": true, 308 "answer": true, 309 "justification": "Table 5 reports the number of LM calls per sample (avg, min, max) for all methods across all models, showing AdapTrack requires approximately 20-60% more LM calls. This is a meaningful proxy for inference cost." 310 }, 311 "compute_budget_stated": { 312 "applies": true, 313 "answer": false, 314 "justification": "Hardware is specified (8 NVIDIA RTX A6000 GPUs, 64-core Intel Xeon) but total GPU hours, wall-clock time for all experiments, or total compute budget is not quantified." 315 } 316 } 317 } 318 }