scan.json (23791B)
1 { 2 "paper": { 3 "title": "TyFlow: A Type-Aware Approach to Neural Code Models", 4 "authors": [ 5 "Zhechong Huang", 6 "Zhao Zhang", 7 "Ruyi Ji", 8 "Tingxuan Xia", 9 "Qihao Zhu", 10 "Qinxiang Cao", 11 "Zeyu Sun", 12 "Wiggin Zhou", 13 "Yingfei Xiong" 14 ], 15 "year": 2026, 16 "venue": "ACM Transactions on Software Engineering and Methodology", 17 "arxiv_id": "2510.10216" 18 }, 19 "scan_version": 2, 20 "active_modules": ["experimental_rigor", "data_leakage"], 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": false, 26 "justification": "No repository URL or code archive is provided anywhere in the paper." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": false, 31 "justification": "The SuFu dataset is sourced from a GitHub repo (jiry17/SuFu) and MBJP is referenced at amazon-science/mxeval, but TyFlow's own processed datasets, natural language descriptions, and training splits are not released." 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": true, 36 "justification": "Sec. 6.1.4 specifies: '2 × AMD EPYC 9655 CPUs, 512 GB DDR5 RAM, and 8 × NVIDIA GeForce RTX 4090 GPUs, running Ubuntu 22.04.5 LTS with CUDA 12.8.' Hardware is detailed, though software dependencies beyond CUDA are not listed." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "No step-by-step reproduction instructions, scripts, or README are provided." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "Results in Tables 2-5 report only point estimates with no confidence intervals or error bars." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "The paper claims TyFlow 'improves' over baselines based on comparing numbers without any statistical significance tests." 
54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "Tables 2-5 report absolute values and deltas (e.g., Table 3 shows Δ columns). Improvements are contextualized with baseline values (e.g., 'improves pass@10 from 32.76% to 46.55%')." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "SuFu has 58 test tasks and MBJP has ~61 test tasks. These small sizes are not justified or discussed." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "No variance, standard deviation, or spread measures across runs are reported. It is unclear whether results are from single or multiple runs." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "The paper compares against CodeT5-220M, T5Gemma2-2B, rejection sampling, and Type-First/Code-First variants (Tables 2, 4, 5)." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": false, 80 "justification": "CodeT5 (2021) is not contemporary. T5Gemma2-2B (2025) is more recent, but the paper does not compare against state-of-the-art code generation models like larger LLMs or dedicated code models beyond these two base models." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "Table 3 provides a sequential ablation study incrementally adding syntactic pruning, type pruning, and dynamic typing context." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Four metrics are used: pass@1, pass@10, FSP, and CER (Sec. 6.1.3)." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": false, 95 "justification": "No human evaluation of generated code quality is performed. Evaluation is entirely automated via test cases." 
96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "SuFu uses 80/20 train/test split and MBJP uses 90/10 split (Sec. 6.1.1). The best-performing checkpoint is selected during training, but the paper does not state whether selection used a validation set or the test set." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Results are broken down per language (SuFu vs Java) in all tables, and per component in the ablation study (Table 3)." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Fig. 1 shows a failure case of CodeT5 and constrained decoding. The paper discusses CER residuals in Java (3.52%) due to uncaptured static analysis constraints like unreachable code." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The Type-First and Code-First approaches are shown to still have high CER and lower performance (Table 5), and rejection sampling is shown to provide minimal improvement (Table 4)." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "The abstract claims TyFlow 'eliminates type errors' and 'significantly improves functional correctness,' both supported by Tables 2-5 showing CER reduction and pass@k improvements." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "Causal claims about components are backed by a controlled, sequential ablation study (Table 3) in which components are added one at a time, cumulatively." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper tests on two specific languages (SuFu and a Java subset) and states results for those contexts. It acknowledges the Java subset omits advanced features. The title 'Neural Code Models' is broad but the evaluation is scoped."
133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": false, 137 "justification": "No discussion of alternative explanations for the improvements. Could the gains come from the additional fine-tuning phase on OpenCoder data (Sec. 6.1.4) rather than the type-guided approach? This confound is not addressed." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper measures pass@k (functional correctness via test cases) and CER (compilation error rate) and frames claims at that level. It does not overclaim beyond what these metrics measure." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": true, 149 "justification": "CodeT5-220M and T5Gemma2-2B are specified with their parameter counts and architectural details (12-layer, 768 hidden dim, etc.) in Sec. 5.1." 150 }, 151 "prompts_provided": { 152 "applies": false, 153 "answer": false, 154 "justification": "TyFlow does not use prompting — it fine-tunes encoder-decoder models on synthesis decision sequences. The 'prompts' in the paper refer to task-level natural language descriptions, not LLM prompts." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": false, 159 "justification": "Sec. 6.1.4 mentions 'standard procedure with the same set of hyperparameters' and 'fine-tuned until convergence' but does not report specific hyperparameters (learning rate, batch size, beam width, etc.)." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "No agentic scaffolding is used. TyFlow is a synthesis system with a custom model architecture, not an agent pipeline." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "Sec. 
6.1.1 describes how SuFu programs were collected from GitHub with manually supplied NL descriptions, and the Java subset covers 608/~780 MBJP tasks. Sec. 6.1.4 mentions an initial fine-tuning phase on OpenCoder data. Appendix B provides detailed grammar specifications." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": false, 176 "justification": "No dedicated limitations or threats-to-validity section exists in the paper." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": false, 181 "justification": "No specific threats to validity are discussed anywhere in the paper." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": false, 186 "justification": "The paper does not explicitly state what the results do NOT show. It acknowledges the Java subset omits advanced features but does not systematically discuss scope boundaries." 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": false, 193 "justification": "No raw experimental data (generated programs, individual test results) is available." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Sec. 6.1.1 and Appendix B describe the data sources: SuFu from jiry17/SuFu GitHub repo (290 programs), MBJP from amazon-science/mxeval (608 tasks). NL descriptions for SuFu were generated by GPT-o3-mini and then human-reviewed." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants. Data comes from standard benchmark datasets." 
204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "The paper documents: collection from GitHub/MBJP → manual NL description creation → train/test split (80/20 SuFu, 90/10 MBJP) → type-checking to produce derivation trees → synthesis decision sequence extraction → fine-tuning." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": false, 215 "justification": "No funding or acknowledgments section is present in the paper." 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Author affiliations are listed: Peking University, University of Michigan, Shanghai Jiao Tong University, Chinese Academy of Sciences, and Tencent." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": false, 225 "justification": "One author is affiliated with Tencent (industry). No funding disclosure makes independence impossible to assess." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial disclosure statement is present." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": true, 236 "answer": false, 237 "justification": "The base models CodeT5 and T5Gemma2 are pre-trained models fine-tuned on benchmark data. No training data cutoff is discussed." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": true, 241 "answer": false, 242 "justification": "No discussion of whether MBJP test problems could have been in CodeT5 or T5Gemma2 pre-training data." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": true, 246 "answer": false, 247 "justification": "MBJP/MBPP is a well-known public benchmark that could be in pre-training data of both base models. This is not discussed." 
248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in this study." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in this study." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in this study." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this study." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in this study." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in this study." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "Token counts are compared (Table 5: 410 vs 752 tokens) but no wall-clock time, latency, or cost-per-example is reported." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "Hardware is described (8x RTX 4090) but total training time, GPU hours, or compute budget is not stated." 297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "No mention of multiple random seeds or sensitivity analysis. Train/test splits are described as random but it is unclear if results are from a single split/seed." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": false, 308 "justification": "The number of experimental runs is not stated anywhere." 
309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": false, 313 "justification": "Sec. 6.1.4 mentions 'multiple checkpoints saved' and 'best-performing checkpoint is used' but no search budget or number of configurations tried is reported." 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": false, 318 "justification": "'The best-performing checkpoint is used for evaluation' (Sec. 6.1.4) but no detail on what criterion was used for selection (validation set? test set?)." 319 }, 320 "multiple_comparison_correction": { 321 "applies": false, 322 "answer": false, 323 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "The authors implement all baselines (CodeT5, rejection sampling, Type-First, Code-First) themselves without acknowledging self-comparison bias." 329 }, 330 "compute_budget_vs_performance": { 331 "applies": true, 332 "answer": false, 333 "justification": "TyFlow adds a synthesis system and dynamic re-encoding at each step, likely increasing compute. This overhead is not quantified or compared." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": false, 338 "justification": "MBJP's suitability for measuring type-aware code generation is not discussed. The paper does not question whether pass@k on MBJP actually measures type reasoning capability." 339 }, 340 "scaffold_confound_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "No scaffolding is involved; TyFlow is a model architecture, not an agent." 344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "MBJP is a public benchmark. Whether CodeT5 or T5Gemma2 saw MBJP problems during pre-training is not discussed." 
351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "No discussion of whether the evaluation setup leaks information." 356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "No discussion of train/test independence beyond stating the split ratios." 361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": false, 365 "justification": "No leakage detection or prevention method is applied." 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "TyFlow eliminates type errors (CER = 0.00%) on SuFu and greatly reduces them on Java (from 38.51%/15.22% to 3.52%/3.12%).", 372 "evidence": "Table 2 (Sec. 6.2.1): CER drops to 0.00% for both TyFlow variants on SuFu; Java CER drops from 38.51% to 3.52% (220M) and 15.22% to 3.12% (2B).", 373 "supported": "strong" 374 }, 375 { 376 "claim": "TyFlow significantly improves functional correctness (pass@k) over baseline models.", 377 "evidence": "Table 2: SuFu pass@10 improves from 32.76% to 46.55% (220M) and 37.93% to 50.00% (2B). 
Java pass@10 improves from 20.90% to 28.36% (220M) and 35.82% to 40.30% (2B).", 378 "supported": "moderate" 379 }, 380 { 381 "claim": "TyFlow outperforms rejection sampling, which provides only minor improvements over vanilla generation.", 382 "evidence": "Table 4: Rejection sampling barely improves pass@1/10 on either benchmark, while TyFlow achieves substantial gains.", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "TyFlow's integrated approach outperforms separated Type-First and Code-First generation with fewer tokens.", 387 "evidence": "Table 5: TyFlow-220M achieves higher pass@10 (46.55% vs 36-40% on SuFu) with 45-48% fewer tokens (410 vs 752 on SuFu).", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "Type pruning is the most impactful component, eliminating all compilation errors and boosting pass@1 by 13.79%.", 392 "evidence": "Table 3 ablation study on SuFu: adding type pruning drops CER from 72.13% to 0.00% and raises pass@1 from 27.59% to 37.93%.", 393 "supported": "strong" 394 } 395 ], 396 "methodology_tags": ["benchmark-eval"], 397 "key_findings": "TyFlow introduces a novel synthesis system that unifies type reasoning with code generation by constructing existential type correctness proofs. Evaluated on SuFu (functional language) and Java, TyFlow eliminates type errors on SuFu (CER 0.00%) and reduces them significantly on Java (CER ~3%), while improving functional correctness (pass@10) by roughly 4.5-14 percentage points over baseline models. The approach outperforms both rejection sampling and separated type-code generation approaches while using fewer tokens.", 398 "red_flags": [ 399 { 400 "flag": "No statistical rigor", 401 "detail": "All comparisons are based on point estimates with no confidence intervals, significance tests, variance across runs, or even stated number of runs. With small test sets (58 and ~61 tasks), the observed differences could be due to chance."
402 }, 403 { 404 "flag": "Small test sets", 405 "detail": "SuFu test set has only ~58 tasks (20% of 290) and MBJP test set has ~61 tasks (10% of 608). These are very small for drawing reliable conclusions about performance differences." 406 }, 407 { 408 "flag": "Confounded fine-tuning", 409 "detail": "Sec. 6.1.4 states TyFlow models receive 'an initial fine-tuning phase using a supervised fine-tuning dataset from OpenCoder' that baselines do not receive. This additional fine-tuning is a confound not addressed in the evaluation." 410 }, 411 { 412 "flag": "No limitations section", 413 "detail": "The paper has no dedicated limitations, threats to validity, or scope boundaries section despite making strong claims about the approach's effectiveness." 414 }, 415 { 416 "flag": "Checkpoint selection potentially on test data", 417 "detail": "'The best-performing checkpoint is used for evaluation' but no validation set is mentioned. If the best checkpoint was selected based on test performance, results are optimistically biased." 418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "Program Synthesis with Large Language Models", 423 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell I. Nye"], 424 "year": 2021, 425 "arxiv_id": "2108.07732", 426 "relevance": "Foundational work on LLM-based program synthesis." 427 }, 428 { 429 "title": "Evaluating Large Language Models Trained on Code", 430 "authors": ["Mark Chen", "Jerry Tworek"], 431 "year": 2021, 432 "arxiv_id": "2107.03374", 433 "relevance": "Introduces HumanEval/Codex and pass@k metrics used in code generation evaluation." 434 }, 435 { 436 "title": "What's Wrong with Your Code Generated by Large Language Models? An Extensive Study", 437 "authors": ["Shihan Dou"], 438 "year": 2024, 439 "arxiv_id": "2407.06153", 440 "relevance": "Studies errors in LLM-generated code, finding type errors account for 33.6% of failures." 
441 }, 442 { 443 "title": "Type-Constrained Code Generation with Language Models", 444 "authors": ["Niels Mündler", "Jingxuan He", "Hao Wang"], 445 "year": 2025, 446 "arxiv_id": "2504.09246", 447 "relevance": "Contemporary work on type-constrained decoding for code generation." 448 }, 449 { 450 "title": "An Empirical Evaluation of GitHub Copilot's Code Suggestions", 451 "authors": ["Nhan Nguyen", "Sarah Nadi"], 452 "year": 2022, 453 "doi": "10.1145/3524842.3528470", 454 "relevance": "Empirical evaluation of Copilot showing 24% compilation errors." 455 }, 456 { 457 "title": "Grammar-Aligned Decoding", 458 "authors": ["Kanghee Park", "Jiayu Wang"], 459 "year": 2024, 460 "relevance": "Shows constrained decoding can distort LM output distribution, motivating TyFlow's approach." 461 }, 462 { 463 "title": "GrammarT5: Grammar-Integrated Pretrained Encoder-Decoder Neural Model for Code", 464 "authors": ["Qihao Zhu", "Qingyuan Liang"], 465 "year": 2024, 466 "doi": "10.1145/3597503.3639125", 467 "relevance": "Prior work on grammar-based code representation for LMs, extended by TyFlow to type systems." 468 }, 469 { 470 "title": "Bugs in large language models generated code: an empirical study", 471 "authors": ["Florian Tambon"], 472 "year": 2025, 473 "doi": "10.1007/S10664-025-10614-4", 474 "relevance": "Empirical study of bugs in LLM-generated code." 475 }, 476 { 477 "title": "Copiloting the Copilots: Fusing Large Language Models with Completion Engines for Automated Program Repair", 478 "authors": ["Yuxiang Wei", "Chunqiu Steven Xia", "Lingming Zhang"], 479 "year": 2023, 480 "doi": "10.1145/3611643.3616271", 481 "relevance": "Constrained decoding approach for code generation and repair." 
482 }, 483 { 484 "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation", 485 "authors": ["Yue Wang", "Weishi Wang"], 486 "year": 2021, 487 "doi": "10.18653/V1/2021.EMNLP-MAIN.685", 488 "relevance": "Base model used in TyFlow evaluation." 489 }, 490 { 491 "title": "Tare: Type-Aware Neural Program Repair", 492 "authors": ["Qihao Zhu", "Zeyu Sun"], 493 "year": 2023, 494 "doi": "10.1109/ICSE48619.2023.00126", 495 "relevance": "Neural approach to learning Java's type system for program repair." 496 }, 497 { 498 "title": "SynCode: LLM Generation with Grammar Augmentation", 499 "authors": ["Shubham Ugare", "Tarun Suresh"], 500 "year": 2025, 501 "relevance": "Constrained decoding for syntactic correctness in LLM code generation." 502 } 503 ] 504 }