scan.json (28004B)
1 { 2 "paper": { 3 "title": "miniCodeProps: a Minimal Benchmark for Proving Code Properties", 4 "authors": ["Evan Lohn", "Sean Welleck"], 5 "year": 2024, 6 "venue": "arXiv.org", 7 "arxiv_id": "2406.11915", 8 "doi": "10.48550/arXiv.2406.11915" 9 }, 10 "scan_version": 3, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "miniCodeProps is a 201-property benchmark in Lean 4 for evaluating neural theorem provers on code verification, sourced from Tons of Inductive Problems. GPT-4o (pass@32) proves 75.6% of easy properties but only 4.34% of medium+hard; a small specialized model (ntp-ctx-1.3B, 35.8% overall) matches or exceeds GPT-4o (34.8%). A many-to-one refinement technique yields modest gains (37.3% overall). The benchmark demonstrates that current LLM-based provers cannot handle even relatively simple code verification beyond basic induction.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "Evaluation code released at https://github.com/cmu-l3/minicodeprops-eval, linked in footnote 1. The benchmark dataset is published on HuggingFace." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The benchmark is published as a public dataset on HuggingFace (Section 3.3: 'We have published our benchmark as a public dataset on Huggingface'). Source is derived from the public TIP repository." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No requirements.txt, Dockerfile, or detailed environment setup is provided in the paper. The paper mentions Lean 4 but does not specify library versions or dependencies needed to run the evaluation." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "The paper states evaluation code is publicly available but does not provide step-by-step reproduction instructions. No commands, scripts, or detailed procedure for replicating the baseline experiments are given in the paper." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Table 3 reports only point estimates (e.g., '75.6% (65/86)') with no confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "No statistical significance tests are reported. Claims like ntp-ctx showing 'comparable, or better, performance than GPT-4o' are based solely on comparing raw percentages without any formal test." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Results in Table 3 report both percentages and raw counts (e.g., '75.6% (65/86)'), with baseline comparisons providing sufficient context to assess magnitude. The refinement improvement is reported as 77.9% vs 75.6% on easy and 6.96% vs 4.34% on medium+hard." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": true, 56 "justification": "Section 3: 'We specifically make miniCodeProps fairly small, since evaluating theorem proving models requires many calls to the language model... is hence time-consuming and prohibitively expensive for large datasets.' Section 5: 'it is unlikely that these high-level conclusions would change given more properties drawn from the same distribution, thereby justifying miniCodeProps' size of 201 properties.'" 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single experimental runs." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Three baselines are compared: GPT-4o full proof generation (pass@32), GPT-4o with refinement, and ntp-ctx-1.3B tactic-by-tactic generation with best-first search (Table 3)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "GPT-4o was a highly performant general-purpose model at the time. ntp-ctx-1.3B is described as 'the only tactic generation model specifically trained to accept both the current proof state and an additional input context' at the time of experimentation (Section 4)." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": false, 78 "justification": "No systematic ablation study is performed. The comparison between GPT-4o base and +refinement shows the effect of refinement, but no ablation of benchmark design choices, prompt formatting, or context components is conducted." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": false, 83 "justification": "Only one metric is used: pass rate (percentage of specifications proven). No secondary metrics such as proof length, generation time, or partial progress measures are reported." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "Human evaluation is irrelevant here — the Lean 4 kernel provides a definitive, formal verification of proof correctness." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "The refinement parameters (k=16, T=2) were selected via 'a small scale experiment' on an unspecified dataset, possibly the benchmark itself. No explicit dev/test split is described, and no held-out validation procedure is documented." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 3 breaks down results by Medley (Easy), Termination + Sorting (Medium + Hard), and Overall. Table 1 provides composition breakdown by subject matter (Numbers, Lists, Trees, Heaps) and difficulty level." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Appendix A shows a detailed incorrect proof from GPT-4 for a sorting property, discussing specific failure modes including invalid induction strategies and placeholder arguments. Section 3.3 discusses what makes certain properties difficult." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The core finding is negative: all methods fail on nearly all medium and hard properties (4.34%-8.69% on medium+hard). The paper is designed to demonstrate the limitations of current methods." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims are well-supported: state-of-the-art methods show promise on easy properties (72-78%) yet fail on medium and hard (4-9%). Table 3 directly supports these claims." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper's causal claims are modest. 'Changing the generation algorithm (namely, to refinement) leads to some additional successes' is supported by a controlled comparison (same model, same benchmark, added refinement). The comparison is adequate for this claim." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section 5 'Scope': 'performing well on this subset is only a proxy for the ability to perform well on all realistic code properties.' The title ('Minimal Benchmark') and throughout the paper, claims are bounded to the tested setting of simple Lean 4 properties." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not substantively discuss alternative explanations for why methods fail on medium/hard properties. Possible confounds — such as proof length vs. conceptual difficulty, or the effect of Lean-specific syntax unfamiliarity — are not explored." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper measures exactly what it claims: whether neural provers can generate correct formal proofs for code properties in Lean 4. No proxy gap exists — pass/fail on the Lean kernel directly measures the stated capability." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper says 'GPT-4o' without a specific API version or snapshot date. ntp-ctx-1.3B is named specifically as a DeepSeek-Coder 1.3B fine-tune, but GPT-4o lacks version specificity per the schema criterion." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper describes what information is provided in prompts ('dependent code and the property being proven') but does not provide the actual prompt text or templates used for GPT-4o or ntp-ctx-1.3B. For refinement, the prompt is described in natural language ('learn from the failed attempts') without the actual text." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "GPT-4o uses 'default generation parameters' without specifying temperature, top-p, or max tokens. Refinement uses k=16, T=2. ntp-ctx references [37] for search settings. Key generation hyperparameters for GPT-4o are missing." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "Section 4 describes two proving modes: full-proof generation (sample 32 candidates, check with Lean kernel) and tactic-by-tactic best-first search. The refinement technique is described as a 'many-to-one' approach with k=16 samples over T=2 rounds. The workflow is clear enough to understand." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 3.1 describes the collection process: filtered TIP files to remove primarily mathematical induction, translated remaining Haskell to Lean 4, associated metadata. Section 3.2 describes the three categories and Table 1 provides the breakdown." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 5 contains a 'Scope' subsection that substantively discusses limitations, including that the benchmark is only a proxy for realistic code properties and is sourced from a single dataset (TIP)." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "The Scope subsection discusses threats specific to this study: the benchmark is sourced from TIP (may not represent all code properties), it tests self-contained problems rather than complex projects, and good performance is 'only a proxy for the ability to perform well on all realistic code properties.'" 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "The paper explicitly states it tests self-contained properties rather than complex verification projects, focuses on Lean 4 only, and draws from a single source (TIP). Section 1: 'Rather than evaluating on complex projects, our goal is to find a minimal set of meaningful program properties that breaks current LLM-based provers.'" 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "The benchmark is publicly released on HuggingFace and GitHub, including all 201 property definitions, metadata, and evaluation code." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 3.1 describes the collection: properties sourced from TIP, translated from Haskell to Lean 4, filtered to focus on programming rather than mathematical induction, with metadata associated per Table 2." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data source is the public TIP benchmark repository." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section 3.1-3.2 document the pipeline: TIP Haskell source → filter mathematical files → translate to Lean 4 → write termination lemmas → associate metadata → format as JSONL. Table 1 provides the composition breakdown (201 properties from 95 functions)." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "Acknowledgements: 'Sean Welleck thanks Convergent Research, the Lean FRO, and the OpenAI Researcher Access Program for their support.'" 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Both authors are listed as Carnegie Mellon University. No affiliation with OpenAI or other companies whose products are evaluated." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "The OpenAI Researcher Access Program provided support, and GPT-4o (an OpenAI product) is one of the primary baselines evaluated. OpenAI has a financial interest in GPT-4o's perceived capabilities." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff date is stated for GPT-4o or for the DeepSeek-Coder base of ntp-ctx-1.3B." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "The benchmark is newly translated into Lean 4, providing some natural contamination protection, but this is not explicitly discussed. The underlying TIP problems (Haskell, 2015) could appear in model training data." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "TIP was published in 2015, well before GPT-4o's training. While the Lean 4 translations are new, the logical structure and solutions are publicly available in Haskell. This contamination risk is not discussed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No API costs, token counts, or wall-clock time reported despite extensive GPT-4o usage (32 candidates × 201 properties + refinement rounds)." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total computational budget stated for GPT-4o API calls or ntp-ctx-1.3B inference." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "Results appear to be from a single experimental run. No seed sensitivity analysis or results across multiple random seeds are reported." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "The number of generation attempts is clearly stated: pass@32 for GPT-4o (32 candidates), k=16 per round with T=2 rounds for refinement. ntp-ctx uses settings from [37]." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "GPT-4o uses 'default generation parameters' with no search. Refinement parameters (k=16, T=2) were selected via 'a small scale experiment' but no search budget or number of configurations tried is reported." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The refinement configuration is justified only by 'In a small scale experiment we confirmed that this outperformed sampling 64 candidates without refinement.' No details of this experiment are provided, and it may have been conducted on the test set." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors created both the benchmark and the refinement baseline, and evaluate all methods themselves. No discussion of potential author-evaluation bias." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "GPT-4o pass@32, GPT-4o+refinement (32+32 additional), and ntp-ctx-1.3B best-first search use very different compute budgets. This is not discussed or controlled for." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": true, 331 "justification": "Section 5 'Scope' explicitly discusses construct validity: 'performing well on this subset is only a proxy for the ability to perform well on all realistic code properties.' The paper justifies the benchmark design as testing 'minimum level of competency' (Section 3.2)." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": true, 335 "answer": false, 336 "justification": "GPT-4o uses full proof generation while ntp-ctx-1.3B uses tactic-by-tactic best-first search — fundamentally different scaffolding. The paper notes ntp-ctx shows 'comparable, or better, performance' without controlling for the scaffold difference." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "TIP was published in 2015 and the Haskell source code is publicly available. GPT-4o could have seen the original problems and solutions during training. This temporal leakage risk is not discussed." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "The evaluation provides code dependencies as context, which is necessary for the task. However, whether the provided context could inadvertently leak proof strategies is not discussed." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "All 201 properties are drawn from the same source (TIP) and share structural patterns (inductive proofs over lists, trees, natural numbers). Potential non-independence across benchmark items is not discussed." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No concrete leakage detection or prevention method is applied. The Lean 4 translation provides some natural protection but this is incidental, not a deliberate decontamination measure." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "miniCodeProps is sufficient to break current LLM-based provers, with state-of-the-art methods failing to prove nearly all medium and hard properties.", 365 "evidence": "Table 3: GPT-4o proves 4.34% (5/115) of medium+hard properties; ntp-ctx-1.3B proves 8.69% (10/115). Even with refinement, GPT-4o only reaches 6.96% (8/115).", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Current neural theorem provers show nontrivial performance on easy (medley) properties, demonstrating some ability for basic inductive proofs in code verification.", 370 "evidence": "Table 3: GPT-4o proves 75.6% (65/86) of easy properties; ntp-ctx-1.3B proves 72.1% (62/86); GPT-4o+refinement proves 77.9% (67/86).", 371 "supported": "strong" 372 }, 373 { 374 "claim": "The small ntp-ctx-1.3B model trained on Mathlib shows comparable or better overall performance than GPT-4o.", 375 "evidence": "Table 3: ntp-ctx-1.3B achieves 35.8% overall vs GPT-4o's 34.8%. On medium+hard, ntp-ctx achieves 8.69% vs GPT-4o's 4.34%.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Refinement (many-to-one technique) leads to additional successes on medium and hard properties.", 380 "evidence": "Table 3: GPT-4o+refinement proves 8/115 medium+hard properties vs 5/115 without refinement (3 additional proofs).", 381 "supported": "weak" 382 } 383 ], 384 "red_flags": [ 385 { 386 "flag": "No uncertainty quantification", 387 "detail": "All results in Table 3 are single-run point estimates with no error bars, confidence intervals, or variance across runs. Pass@32 involves stochastic sampling, so results could vary across runs." 388 }, 389 { 390 "flag": "No cost reporting despite extensive API usage", 391 "detail": "The evaluation requires 32 GPT-4o generations per property (201 properties) plus refinement rounds, yet no API costs or compute time are reported. This makes it difficult to assess practical reproducibility." 392 }, 393 { 394 "flag": "Possible test-set tuning for refinement parameters", 395 "detail": "Refinement parameters (k=16, T=2) were selected via 'a small scale experiment' without specifying whether this was on a held-out set or the benchmark itself. If tuned on the test set, the refinement results are optimistically biased." 396 }, 397 { 398 "flag": "Contamination risk from public TIP source", 399 "detail": "The benchmark is translated from TIP (2015 Haskell problems). While Lean 4 translations are new, the underlying logical problems and solutions are publicly available and may appear in GPT-4o's training data. This is not discussed." 400 } 401 ], 402 "cited_papers": [ 403 { 404 "title": "Clover: Closed-loop verifiable code generation", 405 "authors": ["Chuyue Sun", "Ying Sheng", "Oded Padon", "Clark Barrett"], 406 "year": 2024, 407 "relevance": "LLM-based approach for automated code verification in Dafny, directly relevant to AI-driven formal verification." 408 }, 409 { 410 "title": "Evaluating large language models trained on code", 411 "authors": ["Mark Chen", "Jerry Tworek"], 412 "year": 2021, 413 "relevance": "Foundational Codex/HumanEval paper for LLM code generation evaluation." 414 }, 415 { 416 "title": "Baldur: Whole-proof generation and repair with large language models", 417 "authors": ["Emily First", "Markus N. Rabe", "Talia Ringer", "Yuriy Brun"], 418 "year": 2023, 419 "relevance": "LLM-based proof generation and repair for formal verification in Isabelle, closely related benchmark work." 420 }, 421 { 422 "title": "LeanDojo: Theorem proving with retrieval-augmented language models", 423 "authors": ["Kaiyu Yang", "Aidan Swope", "Alex Gu"], 424 "year": 2023, 425 "relevance": "Retrieval-augmented theorem proving infrastructure for Lean, relevant to AI-assisted formal verification." 426 }, 427 { 428 "title": "An in-context learning agent for formal theorem-proving", 429 "authors": ["Amitayush Thakur", "George Tsoukalas", "Yeming Wen", "Jimmy Xin", "Swarat Chaudhuri"], 430 "year": 2024, 431 "relevance": "COPRA agent for language-agnostic theorem proving, tested on code verification including CompCert." 432 }, 433 { 434 "title": "Generative language modeling for automated theorem proving", 435 "authors": ["Stanislas Polu", "Ilya Sutskever"], 436 "year": 2020, 437 "relevance": "Foundational work on LLM-based automated theorem proving with best-first search." 438 }, 439 { 440 "title": "minictx: Neural theorem proving with (long-)contexts", 441 "authors": ["Jiewen Hu", "Thomas Zhu", "Sean Welleck"], 442 "year": 2024, 443 "relevance": "Provides the ntp-ctx-1.3B model used as a baseline, context-aware tactic generation for theorem proving." 444 }, 445 { 446 "title": "Llemma: An open language model for mathematics", 447 "authors": ["Zhangir Azerbayev", "Hailey Schoelkopf", "Keiran Paster"], 448 "year": 2024, 449 "relevance": "Open language model for mathematical reasoning used in theorem proving, relevant to AI capability benchmarking." 450 }, 451 { 452 "title": "Leveraging large language models for automated proof synthesis in Rust", 453 "authors": ["Jianan Yao", "Ziqiao Zhou", "Weiteng Chen", "Weidong Cui"], 454 "year": 2023, 455 "relevance": "LLM-based proof automation for Verus/Rust verification, complementary approach to ITP-based verification." 456 }, 457 { 458 "title": "minif2f: a cross-system benchmark for formal olympiad-level mathematics", 459 "authors": ["Kunhao Zheng", "Jesse Michael Han", "Stanislas Polu"], 460 "year": 2022, 461 "relevance": "Influential minimal benchmark for theorem proving that directly inspired miniCodeProps' design and naming." 462 }, 463 { 464 "title": "Draft, sketch, and prove: Guiding formal theorem provers with informal proofs", 465 "authors": ["Albert Qiaochu Jiang", "Sean Welleck"], 466 "year": 2023, 467 "relevance": "Prompting strategy for theorem proving using informal proofs, relevant to AI-assisted formal verification." 468 } 469 ], 470 "engagement_factors": { 471 "practical_relevance": { 472 "score": 1, 473 "justification": "Primarily of interest to theorem proving researchers; not directly usable by practitioners for everyday code verification." 474 }, 475 "surprise_contrarian": { 476 "score": 1, 477 "justification": "Confirms the expected finding that LLM provers struggle beyond simple cases, though the small-model-matching-GPT-4o result is mildly surprising." 478 }, 479 "fear_safety": { 480 "score": 0, 481 "justification": "No AI safety or security concerns raised; the paper is about formal verification capabilities." 482 }, 483 "drama_conflict": { 484 "score": 0, 485 "justification": "No controversy or conflict; straightforward benchmark contribution." 486 }, 487 "demo_ability": { 488 "score": 2, 489 "justification": "Code and benchmark are publicly released on GitHub and HuggingFace, allowing researchers to run evaluations." 490 }, 491 "brand_recognition": { 492 "score": 1, 493 "justification": "CMU authors have some recognition in the theorem proving community; GPT-4o is well-known but the paper itself is niche." 494 } 495 } 496 }