scan.json (24882B)
1 { 2 "paper": { 3 "title": "Automated Formalization via Conceptual Retrieval-Augmented LLMs", 4 "authors": [ 5 "Wangyue Lu", 6 "Lun Du", 7 "Sirui Li", 8 "Ke Weng", 9 "Haozhe Sun", 10 "Hengyu Liu", 11 "Minghe Yu", 12 "Tiancheng Zhang", 13 "Ge Yu" 14 ], 15 "year": 2025, 16 "venue": "arXiv preprint", 17 "arxiv_id": "2508.06931" 18 }, 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper. The paper describes a framework (CRAMF) and a knowledge base but provides no link to access them." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper uses public datasets miniF2F and ProofNet, but also introduces two new datasets (AdvancedMath with 173 problems and CombMath with 241 problems) without providing download links or release information for either." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "No requirements.txt, Dockerfile, conda environment, or detailed library version information is provided. The paper mentions specific models (MathBERT, bge-reranker-v2-m3, Faiss) but does not specify versions or environment setup details." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The method description covers the framework architecture but not how to run it." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "All results in Tables 1-6 are reported as single point estimates (e.g., '69.3%', '47.1%') with no confidence intervals, error bars, or uncertainty measures." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper claims CRAMF 'significantly improves' and 'consistently improves' performance over baselines but provides no statistical significance tests (no p-values, no tests). All comparisons are based solely on comparing raw percentages." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "The paper reports relative gain (RG) percentages with baseline context throughout Tables 1 and 2 (e.g., 'from 36.9% to 47.1%, +27.6% relative improvement' for DeepSeek-V3 on MiniF2F). Both baseline and improved values are provided, allowing the reader to assess magnitude." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "The datasets have fixed sizes (miniF2F, ProofNet are standard benchmarks; AdvancedMath has 173 problems; CombMath has 241 problems) but no justification is given for why these sizes are sufficient for the claims made. No power analysis is discussed." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "No standard deviation, variance, or spread measures are reported. It is unclear whether results are from single runs or averaged over multiple runs. All tables show single point estimates only." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper includes baselines for both autoformalization (DeepSeek-V3, GPT-4o, Herald-7B, Kimina-7B without CRAMF) and retrieval (BM25, Rewrite-Retrieve-Read, HyDE), shown in Tables 1-4." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "Baselines include recent models: DeepSeek-V3 (2024), GPT-4o (2024), Herald-7B (2024), Kimina-7B (2025). Retrieval baselines include HyDE (2023) and R3 (2023). These are contemporary and relevant." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "Table 5 presents an ablation study removing individual CRAMF components (MCP, DCR, Rerank) and measuring the impact on FAR@10. Table 6 shows the effect of the rewriting mechanism on CombMath." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "The paper uses multiple evaluation metrics: Compilation Pass Rate@10 (CPR@10), Formalization Accuracy Rate@10 (FAR@10), Average Contribution Score (ACS), and HitRate@3." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": false, 93 "justification": "All evaluation is automated. Formalization accuracy is judged by DeepSeek-V3 for semantic consistency and the Lean compiler for compilation. No human evaluation of output quality is conducted, despite the paper claiming semantic accuracy improvements where human judgment could add value." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": false, 98 "justification": "The paper does not discuss train/test splits or held-out sets for its knowledge base construction or retrieval pipeline. It is unclear whether any tuning decisions were made on the same data used for final evaluation." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Results are broken down per dataset (miniF2F, ProofNet, AdvancedMath) and per model (DeepSeek-V3, GPT-4o, Herald-7B, Kimina-7B) in Tables 1 and 2. Per-component breakdowns are in the ablation study (Table 5)." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Figure 1 presents concrete failure cases illustrating model hallucination (undefined predicates) and semantic gap (conceptual polymorphism of 'neighborhood'). The introduction discusses specific failure modes in detail." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": false, 113 "justification": "Every experiment shows improvement with CRAMF. Even the ablation study only shows degradation when components are removed (which confirms positive contribution). No approaches that were tried and failed or configurations that did not work are reported." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "The abstract claims 'up to 62.1% and an average of 29.9% relative improvement.' Table 2 confirms the 62.1% figure (DeepSeek-V3 on AdvancedMath) and the overall average relative improvement across all model-dataset combinations is consistent with the 29.9% claim." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper makes causal claims about CRAMF components via ablation studies (Table 5, Table 6). The ablations systematically remove individual components (MCP, DCR, Rerank, ReWrite) and measure the effect, constituting controlled single-variable manipulation. The overall claim that CRAMF improves formalization is supported by the before/after comparison." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper claims CRAMF 'effectively suppresses model hallucinations and semantic gaps' and demonstrates 'robustness and generalization capability' in the conclusion, but all experiments are on Lean 4 formalization only. The title says 'Automated Formalization' generally without bounding to Lean 4 or Mathlib specifically. The conclusion mentions extending to 'more complex mathematical domains' as future work but doesn't bound current claims." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "No alternative explanations for the improvements are discussed. For example, the improvement could partly stem from providing any additional context (not necessarily concept-level retrieval), or from the specific LLMs used for evaluation. No robustness checks or confound analysis is presented." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper uses 'GPT-4o' and 'DeepSeek-V3' without specifying API versions or snapshot dates. 'Herald-7B' and 'Kimina-7B' are referenced by paper citation but no specific model checkpoints or versions are given. 'InternLM-Math-7B' and 'InternLM2-Math-Plus-7B' are used without version specifics. 'MathBERT' and 'bge-reranker-v2-m3' lack version details." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper states 'Details on prompt construction and examples are provided in the appendix' for concept extraction, and mentions prompts for ACS scoring are in the Appendix. However, the paper text available does not include the full appendix with actual prompt text. The main text describes prompts only in natural language ('we employ a query enhancement method. Using LLMs under strict prompting')." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No temperature, top-p, max tokens, or other LLM API settings are reported. The paper mentions 'Top-10' and 'Top-5' and 'Top-3' for retrieval parameters but no inference hyperparameters for the LLMs." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "The CRAMF pipeline is described in detail in the Method section: knowledge base construction, concept extraction, query enhancement, dual-pathway hybrid retrieval with reranking, and integration with the autoformalizer. Figure 2 provides an overview diagram. The retrieval and reranking workflow is clearly laid out." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "The knowledge base construction pipeline is documented: parsing Mathlib with doc-gen4, extracting definitions via def/class/structure, back-translation with InternLM-Math-7B, self-consistency validation, and concept extraction with DeepSeek-V3. The paper states it covers 'over 26,000 formal definitions and 1,000+ core mathematical concepts.'" 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no dedicated Limitations, Threats to Validity, or similar section. The conclusion briefly mentions future work ('extend to more complex mathematical domains') but does not discuss limitations of the current work." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed anywhere in the paper. There is no consideration of potential biases in the evaluation methodology (e.g., using DeepSeek-V3 for both ACS scoring and as a baseline model)." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "No explicit scope boundaries are stated. The paper does not clarify what the results do NOT show (e.g., no claim about proof generation, only formalization; limited to Lean 4; specific to Mathlib). The conclusion's mention of 'future work' is not the same as bounding current claims." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw data, experimental logs, or per-example results are made available. Only aggregate results in tables are presented." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "The knowledge base construction from Mathlib is described in detail. For the AdvancedMath dataset, the paper states it 'consists of 173 informalized proof problems in higher mathematics.' For CombMath, it is described as '241 combinatorics problems derived primarily from the textbook Combinatorics: The Art of Counting.' MiniF2F and ProofNet are cited standard benchmarks." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants are involved. The study uses automated benchmarks and datasets." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The data pipeline from Mathlib parsing to knowledge base construction is documented step-by-step: doc-gen4 parsing, back-translation, self-consistency validation, concept extraction, vector encoding, and index construction. The evaluation pipeline (Lean compilation, back-translation, semantic consistency) is also described." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information, acknowledgments, or grant numbers are mentioned anywhere in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly listed: Northeastern University, Ant Research Institute (Ant Group), and Aalborg University. The Ant Group affiliation is disclosed." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed. One author is from Ant Group (Ant Research Institute), which has a potential interest in AI/LLM capabilities, but no funding relationship is disclosed." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement or financial disclosures are present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper evaluates GPT-4o, DeepSeek-V3, Herald-7B, and Kimina-7B on miniF2F and ProofNet benchmarks but does not state any training data cutoff dates for these models." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether the benchmark problems (miniF2F, ProofNet) may have appeared in the training data of the evaluated models. MiniF2F was published in 2021 and ProofNet in 2023, both potentially within training data of models released in 2024-2025." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "MiniF2F (2021) and ProofNet (2023) are publicly available and could be in the training data of GPT-4o and DeepSeek-V3. The paper does not address this contamination risk at all. While CRAMF is compared as a relative improvement, the base model performance could be inflated by contamination." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "CRAMF involves multiple LLM calls (concept extraction, query enhancement, keyword generation) plus retrieval and reranking per problem, but no inference cost, latency, or token consumption is reported." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No computational budget, GPU hours, API costs, or hardware specifications are mentioned. Building the knowledge base from 26,000+ Mathlib definitions also involves significant compute that is not quantified." 290 } 291 } 292 }, 293 "claims": [ 294 { 295 "claim": "CRAMF achieves up to 62.1% and an average of 29.9% relative improvement in formalization accuracy across models and benchmarks.", 296 "evidence": "Table 2 shows FAR@10 relative gains: DeepSeek-V3 achieves 62.1% on AdvancedMath. Average across all 12 model-dataset combinations is approximately 30%.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "CRAMF significantly outperforms all retrieval baselines (BM25, R3, HyDE) in both ACS and HitRate@3.", 301 "evidence": "Tables 3 and 4 show CRAMF achieves ACS of 1.93-2.14 vs. HyDE's 1.52-1.61, and HitRate@3 of 42.9-50.6% vs. HyDE's 30.7-35.5%.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "Dual-channel retrieval is the most critical component, with its removal causing the largest performance degradation (average FAR drop of 15.7%).", 306 "evidence": "Table 5 shows w/o DCR reduces FAR@10 from 63.1/55.1/51.4 to 48.5/36.8/37.1 across three datasets.", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "The problem rewriting mechanism improves FAR by 8.8% on combinatorics problems with implicit mathematical concepts.", 311 "evidence": "Table 6 shows CRAMF achieves 63.1% vs. 54.3% without rewriting on CombMath.", 312 "supported": "moderate" 313 }, 314 { 315 "claim": "CRAMF serves as a plug-and-play enhancement for LLM-based autoformalizers, consistently improving all tested models.", 316 "evidence": "Tables 1 and 2 show improvements for all four models (DeepSeek-V3, GPT-4o, Herald-7B, Kimina-7B) on all three datasets, though gains for Kimina-7B are very small (0.4-8.5%).", 317 "supported": "moderate" 318 } 319 ], 320 "methodology_tags": [ 321 "benchmark-eval" 322 ], 323 "key_findings": "CRAMF is a retrieval-augmented framework that constructs a concept-definition knowledge base from Mathlib4 (26,000+ definitions, 1,000+ concepts) and uses dual-channel hybrid retrieval with reranking to provide contextual grounding for LLM-based mathematical autoformalization. Experiments on miniF2F, ProofNet, and AdvancedMath show consistent improvements in both compilation pass rate and formalization accuracy across four base models, with the largest gains for general-purpose models like DeepSeek-V3 (up to 62.1% relative improvement). Ablation studies confirm that dual-channel retrieval contributes the most, followed by reranking and concept parsing.", 324 "red_flags": [ 325 { 326 "flag": "No uncertainty quantification", 327 "detail": "All results are single point estimates with no error bars, confidence intervals, standard deviations, or indication of multiple runs. It is unclear whether results are from single runs or averages, making it impossible to assess statistical reliability." 328 }, 329 { 330 "flag": "No significance testing", 331 "detail": "The paper uses language like 'significantly outperforms' and 'significant improvement' without any statistical significance tests. All claims of superiority are based solely on comparing raw percentages." 332 }, 333 { 334 "flag": "Evaluator conflict of interest", 335 "detail": "DeepSeek-V3 is used both as an evaluation judge (for semantic consistency assessment in FAR@10) and as one of the baseline models being evaluated. This creates a potential bias where the evaluator may favor its own outputs." 336 }, 337 { 338 "flag": "No limitations section", 339 "detail": "The paper has no limitations, threats to validity, or discussion of potential weaknesses. This is a significant omission for a methods paper." 340 }, 341 { 342 "flag": "Benchmark contamination risk unaddressed", 343 "detail": "MiniF2F (2021) and ProofNet (2023) are publicly available datasets that could appear in the training data of GPT-4o and DeepSeek-V3. The paper does not discuss this risk, though the relative comparison (with vs. without CRAMF) partially mitigates this concern for the improvement claims." 344 }, 345 { 346 "flag": "No cost analysis despite multi-step LLM pipeline", 347 "detail": "CRAMF requires multiple LLM calls per problem (concept extraction, query enhancement, keyword generation, back-translation, semantic consistency checking) plus retrieval and reranking. No cost or latency analysis is provided, making practical applicability hard to assess." 348 }, 349 { 350 "flag": "New datasets not released", 351 "detail": "AdvancedMath (173 problems) and CombMath (241 problems) are introduced but not released, limiting reproducibility and independent verification." 352 } 353 ], 354 "cited_papers": [ 355 { 356 "title": "Autoformalization with large language models", 357 "authors": ["Yuhuai Wu", "Albert Qiaochu Jiang", "Wenda Li", "Markus Rabe", "Charles Staats", "Mateja Jamnik", "Christian Szegedy"], 358 "year": 2022, 359 "relevance": "Foundational work on using LLMs for autoformalization, directly relevant to LLM code generation capability evaluation." 360 }, 361 { 362 "title": "Retrieval-augmented generation for large language models: A survey", 363 "authors": ["Yunfan Gao"], 364 "year": 2023, 365 "arxiv_id": "2312.10997", 366 "relevance": "Comprehensive survey on RAG techniques that ground LLM outputs, relevant to understanding LLM scaffolding approaches." 367 }, 368 { 369 "title": "DeepSeek-V3 technical report", 370 "authors": ["DeepSeek"], 371 "year": 2024, 372 "arxiv_id": "2412.19437", 373 "relevance": "Technical report for a major open-source LLM used as both baseline and evaluator in this study." 374 }, 375 { 376 "title": "LeanDojo: Theorem proving with retrieval-augmented language models", 377 "authors": ["Kaiyu Yang", "Aidan Swope", "Alex Gu"], 378 "year": 2023, 379 "relevance": "Retrieval-augmented approach to theorem proving using premise selection, relevant to LLM-based formal reasoning evaluation." 380 }, 381 { 382 "title": "Kimina-prover preview: Towards large formal reasoning models with reinforcement learning", 383 "authors": ["Haiming Wang"], 384 "year": 2025, 385 "arxiv_id": "2504.11354", 386 "relevance": "State-of-the-art formal reasoning model using RL, relevant to evaluating LLM capabilities in mathematical code generation." 387 }, 388 { 389 "title": "Rethinking and improving autoformalization: towards a faithful metric and a dependency retrieval-based approach", 390 "authors": ["Qian Liu", "Xueliang Zheng"], 391 "year": 2025, 392 "relevance": "Proposes improved metrics and retrieval for autoformalization, directly relevant to evaluation methodology in LLM code generation." 393 }, 394 { 395 "title": "Advancing theorem proving in LLMs through large-scale synthetic data", 396 "authors": ["Huajian Xin", "Daya Guo", "Zhihong Shao"], 397 "year": 2024, 398 "relevance": "Explores synthetic data for improving LLM theorem proving, relevant to LLM training methodology for code tasks." 399 }, 400 { 401 "title": "Lean workbook: A large-scale lean problem set formalized from natural language math problems", 402 "authors": ["Huajian Ying"], 403 "year": 2024, 404 "relevance": "Large-scale benchmark for Lean formalization used as the evaluation pipeline in this study." 405 }, 406 { 407 "title": "Query rewriting in retrieval-augmented large language models", 408 "authors": ["Xinbei Ma", "Yeyun Gong", "Pengcheng He"], 409 "year": 2023, 410 "relevance": "Query rewriting technique for RAG used as a retrieval baseline, relevant to LLM scaffolding methods." 411 }, 412 { 413 "title": "Precise zero-shot dense retrieval without relevance labels", 414 "authors": ["Luyu Gao", "Xueguang Ma", "Jimmy Lin", "Jamie Callan"], 415 "year": 2023, 416 "relevance": "HyDE method used as a retrieval baseline, relevant to zero-shot retrieval approaches for LLM augmentation." 417 } 418 ] 419 }