scan.json (14084B)
1 { 2 "paper": { 3 "title": "Masked Hard-Attention Transformers Recognize Exactly the Star-Free Languages", 4 "authors": ["Andy Yang", "David Chiang", "Dana Angluin"], 5 "year": 2023, 6 "venue": "Neural Information Processing Systems", 7 "arxiv_id": "2310.13897", 8 "doi": "10.52202/079017-0327" 9 }, 10 "scan_version": 2, 11 "active_modules": [], 12 "methodology_tags": ["theoretical"], 13 "key_findings": "The paper establishes exact characterizations of masked hard-attention transformers in terms of formal language classes. With strict masking and no position embeddings, these transformers recognize exactly the star-free languages (equivalent to LTL). Boolean RASP (B-RASP) serves as a convenient intermediate language. The paper further shows that position embeddings, strict vs. non-strict masking, and depth all affect expressive power in precisely characterized ways.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper provides a B-RASP simulator at https://b-rasp.github.io/ (Section 3.2)." 20 }, 21 "data_released": { 22 "applies": false, 23 "answer": false, 24 "justification": "Purely theoretical paper with no dataset." 25 }, 26 "environment_specified": { 27 "applies": false, 28 "answer": false, 29 "justification": "Purely theoretical paper with no computational experiments requiring environment setup." 30 }, 31 "reproduction_instructions": { 32 "applies": false, 33 "answer": false, 34 "justification": "Theoretical paper — proofs are self-contained in the paper and appendices." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": false, 40 "answer": false, 41 "justification": "No empirical experiments are conducted." 42 }, 43 "significance_tests": { 44 "applies": false, 45 "answer": false, 46 "justification": "No empirical experiments are conducted." 47 }, 48 "effect_sizes_reported": { 49 "applies": false, 50 "answer": false, 51 "justification": "No empirical experiments are conducted." 52 }, 53 "sample_size_justified": { 54 "applies": false, 55 "answer": false, 56 "justification": "Theoretical paper with no samples." 57 }, 58 "variance_reported": { 59 "applies": false, 60 "answer": false, 61 "justification": "No empirical experiments are conducted." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": false, 67 "answer": false, 68 "justification": "Theoretical paper establishing equivalences via proofs, not experimental comparisons." 69 }, 70 "baselines_contemporary": { 71 "applies": false, 72 "answer": false, 73 "justification": "No experimental baselines; the paper positions against prior theoretical results." 74 }, 75 "ablation_study": { 76 "applies": false, 77 "answer": false, 78 "justification": "No system with components to ablate." 79 }, 80 "multiple_metrics": { 81 "applies": false, 82 "answer": false, 83 "justification": "No empirical evaluation with metrics." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "Theoretical paper; human evaluation is irrelevant." 89 }, 90 "held_out_test_set": { 91 "applies": false, 92 "answer": false, 93 "justification": "No data or evaluation sets used." 94 }, 95 "per_category_breakdown": { 96 "applies": false, 97 "answer": false, 98 "justification": "No empirical results to break down." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 6 (Limitations) discusses what the results do NOT cover: softmax attention, unmasked attention, and the restriction of position embeddings to finite image." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper shows that non-strict masking reduces expressivity (Section 5.2), and identifies limitations of the framework (Section 6). These are negative results about the model's capabilities." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "All abstract claims (equivalence with star-free languages, role of B-RASP, effects of position embeddings/strict masking/depth) are supported by formal theorems with proofs in the paper and appendices." 116 }, 117 "causal_claims_justified": { 118 "applies": false, 119 "answer": false, 120 "justification": "The paper makes mathematical equivalence claims, not causal claims." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper is precise about what variants of transformers are covered (hard attention, specific masking types) and explicitly states in Section 6 that results do not apply to softmax-attention or unmasked transformers." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": false, 129 "answer": false, 130 "justification": "Theoretical paper with formal proofs; alternative explanations are not applicable to mathematical proofs." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": false, 134 "answer": false, 135 "justification": "Theoretical paper with no measurements." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": false, 141 "answer": false, 142 "justification": "No LLM or model is used in experiments." 143 }, 144 "prompts_provided": { 145 "applies": false, 146 "answer": false, 147 "justification": "No prompting is used." 148 }, 149 "hyperparameters_reported": { 150 "applies": false, 151 "answer": false, 152 "justification": "No experiments with hyperparameters." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used." 158 }, 159 "data_preprocessing_documented": { 160 "applies": false, 161 "answer": false, 162 "justification": "No data preprocessing; purely theoretical." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 6 is titled 'Limitations' and substantively discusses three key limitations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 6 identifies specific limitations: results apply only to hard attention (not softmax), only to masked attention (not unmasked), and the finite-image restriction on position embeddings does not exactly match standard definitions." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 6 explicitly states what the results do NOT apply to: softmax-attention transformers, unmasked attention, and standard (non-rational) sinusoidal position embeddings." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": false, 185 "answer": false, 186 "justification": "No empirical data collected." 187 }, 188 "data_collection_described": { 189 "applies": false, 190 "answer": false, 191 "justification": "No data collection; theoretical paper." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No participants or data recruitment." 197 }, 198 "data_pipeline_documented": { 199 "applies": false, 200 "answer": false, 201 "justification": "No data pipeline; theoretical paper." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding or acknowledgments section mentioning grants or funding sources is present (the Acknowledgements section thanks individuals and reviewers but does not mention funding)." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations (University of Notre Dame, Yale University) are listed on the first page." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": false, 217 "answer": false, 218 "justification": "No funding disclosed; appears to be unfunded academic work." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "The paper does not evaluate any pre-trained model on a benchmark." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "The paper does not evaluate any pre-trained model on a benchmark." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "The paper does not evaluate any pre-trained model on a benchmark." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "Purely theoretical paper." 285 }, 286 "compute_budget_stated": { 287 "applies": false, 288 "answer": false, 289 "justification": "Purely theoretical paper." 290 } 291 } 292 }, 293 "claims": [ 294 { 295 "claim": "Masked hard-attention transformers with strict future masking and no position embeddings recognize exactly the star-free languages.", 296 "evidence": "Theorems 1-4 establish equivalence chains: LTL ↔ B-RASP ↔ masked hard-attention transformers. Full proofs in Appendices B and C.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "Non-strict masking reduces expressivity to the stutter-invariant star-free languages.", 301 "evidence": "Theorem 6, building on Peled and Wilke (1997), with proof adaptation described in Section 5.2.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "With rational sinusoidal position embeddings, masked hard-attention transformers recognize exactly the regular languages in AC⁰.", 306 "evidence": "Corollary 8, proved via connection to FO[<, MOD] (Appendix D.2).", 307 "supported": "strong" 308 }, 309 { 310 "claim": "Adding more self-attention layers strictly increases expressive power at every depth level.", 311 "evidence": "Theorem 10, using separating languages STAIR_{k+1} from Etessami and Wilke (2000).", 312 "supported": "strong" 313 } 314 ], 315 "red_flags": [], 316 "cited_papers": [ 317 { 318 "title": "Attention is all you need", 319 "authors": ["Ashish Vaswani", "Noam Shazeer", "Niki Parmar"], 320 "year": 2017, 321 "relevance": "Foundational transformer architecture paper that this work theoretically analyzes." 322 }, 323 { 324 "title": "Thinking like Transformers", 325 "authors": ["Gail Weiss", "Yoav Goldberg", "Eran Yahav"], 326 "year": 2021, 327 "relevance": "Introduced RASP programming language for reasoning about transformer computations; B-RASP builds on this." 328 }, 329 { 330 "title": "Theoretical limitations of self-attention in neural sequence models", 331 "authors": ["Michael Hahn"], 332 "year": 2020, 333 "relevance": "Prior work on expressivity bounds of transformers, showing hard-attention cannot compute parity." 334 }, 335 { 336 "title": "Formal language recognition by hard attention Transformers: Perspectives from circuit complexity", 337 "authors": ["Yiding Hao", "Dana Angluin", "Robert Frank"], 338 "year": 2022, 339 "relevance": "Established AC⁰ upper bound for hard-attention transformers; this paper resolves open questions from this work." 340 }, 341 { 342 "title": "The expressive power of transformers with chain of thought", 343 "authors": ["William Merrill", "Ashish Sabharwal"], 344 "year": 2024, 345 "relevance": "Studies transformer expressivity with chain-of-thought, another exact characterization result." 346 }, 347 { 348 "title": "Overcoming a theoretical limitation of self-attention", 349 "authors": ["David Chiang", "Peter Cholak"], 350 "year": 2022, 351 "relevance": "Shows soft-attention transformers can compute parity, contrasting with hard-attention limitations studied here." 352 }, 353 { 354 "title": "Learning Transformer programs", 355 "authors": ["Dan Friedman", "Alexander Wettig", "Danqi Chen"], 356 "year": 2023, 357 "relevance": "Studies learning in transformer-like programs; related to the associative recall task example in this paper." 358 } 359 ] 360 }