scan.json (25959B)
1 { 2 "paper": { 3 "title": "TFHE-Coder: Evaluating LLM-agentic Fully Homomorphic Encryption Code Generation", 4 "authors": ["Mayank Kumar", "Jiaqi Xue", "Mengxin Zheng", "Qian Lou"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2503.12217", 8 "doi": "10.48550/arXiv.2503.12217" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "GPT-4o significantly outperforms open-source models (CodeLlama, Qwen2.5-Coder, Deepseek-Coder) in generating compilable and functionally correct TFHE code, with open-source models achieving zero functional correctness in the baseline setting. Few-shot prompting is substantially more effective than RAG alone, and combining both yields the best results. ReLU remains the most challenging task across all models and techniques, while simpler logic gates (NOT, AND, OR) are more tractable. The paper establishes the first benchmark for TFHE code generation, though it is limited to only four tasks.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No repository URL, code archive, or link to the evaluation framework is provided anywhere in the paper." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "No benchmark dataset, task specifications, reference implementations, or generated outputs are released. The 4 tasks (NOT, AND, OR, ReLU) are described but the actual prompts and reference code are not publicly available." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "Section III.B lists hardware (i5-12600K, 64GB RAM, RTX 3090, Ubuntu 24.04, Python 3.8, CUDA 12.2) and the embedding model (jina-embeddings-v2-base-code), but there is no requirements.txt, dependency list, TFHE library version, or sufficient detail to recreate the software environment." 
30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No reproduction instructions, README, or scripts to replicate experiments are provided." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Despite averaging over 5 runs, no confidence intervals or error bars are reported. Figures 3, 5, 6, and 7 show only point estimates." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "Multiple comparative claims are made (e.g., 'GPT-4o consistently outperforms', 'few-shot prompting significantly improves correctness') without any statistical significance tests." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Specific improvements with baseline context are reported, e.g., 'Deepseek-Coder on AND improves from 0.12 to 0.83 (CrystalBLEU) and from 0.0 to 1.0 (Pass@k).' Absolute scores are given for all models across all conditions." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper uses nt=5 repeated experiments per condition with no justification for why 5 was chosen or whether this is sufficient for reliable estimates." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "The paper states 'five repeated experiments and averaging the results' but never reports standard deviation, variance, or any spread measure across those runs." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "A clear baseline (compiler-in-the-loop without RAG or few-shot) is established, against which RAG, few-shot, and RAG+few-shot are compared." 
69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Models evaluated include GPT-4o, Claude-3.5-Haiku, Qwen2.5-Coder, and Deepseek-Coder, which are recent and representative of both closed and open-source LLMs." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper systematically tests baseline, RAG alone, few-shot alone, and RAG+few-shot, effectively ablating the contribution of each component." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Multiple metrics are used: CrystalBLEU (structural similarity), Pass@k (comp) (compilability), Pass@k (func) (functional correctness), Wrong Format Error, and Repetition Error." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation of generated code quality. All evaluation is automated via compilation and test case execution." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "The few-shot example uses a correct OR gate implementation, yet OR is also one of the four test tasks. There is no separation between the example used for prompting and the tasks evaluated." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down per task (NOT, AND, OR, ReLU) and per model across all metrics in Figures 3-7." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section IV (RQ2) provides detailed failure analysis: wrong format errors, repetition errors, API misuse, hallucinated function calls, and copy-paste behavior. Specific failure patterns are categorized." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "RAG alone is reported as offering 'limited gains' with 'no significant increase in functional correctness.' 
Open-source models' failures are extensively documented. ReLU remains unsolved for most models." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims about 'significant limitations in off-the-shelf models' and 'agentic optimizations reduce errors and enhance code fidelity' are supported by the experimental results in Section IV." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "Causal claims like 'few-shot prompting significantly improves correctness' are supported by controlled single-variable comparisons (baseline vs. +RAG, vs. +few-shot, vs. +both). The ablation design provides adequate causal evidence." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "The abstract specifies 'focusing on logic gates and ReLU activation' and the paper consistently frames findings around these specific tasks. The title is specific to TFHE code generation." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No alternative explanations are discussed. The paper does not consider confounds such as model size differences (7B vs. unknown for GPT-4o/Claude), training data composition, or whether the performance gap is due to TFHE-specific vs. general coding ability." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper explicitly discusses CrystalBLEU as a proxy: 'higher similarity often indicates that less effort is required to correct and functionalize the output.' The metrics (compilability, functional correctness, structural similarity) are clearly distinguished from broader claims about TFHE usability." 
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Models are listed as 'GPT-4o', 'Claude-3.5-Haiku', 'CodeLlama-7B-Instruct', 'Qwen2.5-Coder-7B-Instruct', 'deepseek-coder-6.7b-instruct'. GPT-4o and Claude-3.5-Haiku lack snapshot dates or API versions." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "Prompts are described only in natural language: 'A user prompt (consisting of task description and reference C code)' and 'a correct implementation of an OR gate using TFHE's bootsOR function.' The actual prompt text is never provided." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section III.B states: 'we set the temperature to 0.9 and top-p to 0.85' and specifies the iteration limit of 10." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "The compiler-in-the-loop workflow (Fig. 1) and agentic-optimized workflow (Fig. 2) are described in detail, including the feedback loop, RAG integration, and few-shot prompting mechanism." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "The task space is clearly defined (NOT, AND, OR, ReLU), the evaluation pipeline is documented (generate → compile → feedback → measure), and the reference implementation structure is described." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "Section V is titled 'Discussion and Future Work' but contains only a brief paragraph restating results and suggesting future directions. There is no dedicated limitations section with substantive discussion of study limitations." 
170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No specific threats to validity are discussed. The discussion section mentions that 'additional fine-tuning or domain-specific training may be necessary' but does not address threats to the validity of the current study's conclusions." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state what its results do NOT show. It does not discuss limitations of evaluating on only 4 tasks, only gate-level operations, or the non-representativeness of the task set for general TFHE code generation." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw data (generated code samples, compilation logs, per-run results) is made available for independent verification." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section III.B describes the task design rationale, model selection, experimental setup, evaluation metrics, and the iterative generation/compilation workflow." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants are involved. The study evaluates LLMs on constructed coding tasks." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline from prompt to generation to compilation to feedback to evaluation is documented across Sections III.B and III.C with accompanying workflow diagrams (Fig. 1 and Fig. 2)." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding acknowledgment or funding disclosure is present in the paper." 
209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All authors are listed as affiliated with the University of Central Florida. They are not evaluating their own commercial product." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding information is disclosed, so independence of funder cannot be assessed." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff dates are stated for any of the five evaluated models. This is important since TFHE library code and documentation may be in training data." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether TFHE code examples, documentation, or the reference implementations appear in the models' training data." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "TFHE documentation and examples are publicly available and likely in training data for models trained after 2020. The paper does not address this contamination risk despite it being central to interpreting the results." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 
258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "The Appendix reports token consumption for each model (e.g., GPT-4o: 65,829 input tokens, 1,784 output tokens) across different experimental conditions." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "Hardware is specified (RTX 3090, i5-12600K) but total compute budget (wall-clock time, total API spend, GPU hours) is not reported." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "Results are averaged over 5 runs but no variance, standard deviation, or seed sensitivity analysis is reported." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Section III.B states: 'five repeated experiments and averaging the results' and 'nt = 5 and k = 1.'" 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "Temperature 0.9 and top-p 0.85 are used but no justification for these values or search budget is provided." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "All four configurations (baseline, RAG, few-shot, RAG+few-shot) are reported across all models and tasks. 
The 'best' configuration is determined transparently from the full results." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors propose the evaluation framework and evaluate it without acknowledging potential bias in designing the evaluation to favor their approach." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "Token consumption is reported in the Appendix but performance is not plotted as a function of compute. Models with vastly different sizes (7B vs. unknown for GPT-4o) are compared without discussing compute parity." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper does not discuss whether 4 tasks (NOT, AND, OR, ReLU) are representative of TFHE code generation more broadly, or whether these tasks have adequate construct validity for the claims made." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": true, 335 "answer": true, 336 "justification": "All models are evaluated within the same compiler-in-the-loop framework, keeping the scaffold constant across model comparisons." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of whether TFHE library code, documentation, or example implementations existed in model training data before the evaluation." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "The prompt includes reference C code implementations, which provide structural guidance that would not be available in realistic usage.
This is acknowledged as part of the design but the implications for generalizability are not discussed." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "The four tasks are structurally similar gate-level TFHE operations. The non-independence of these tasks (success on one may predict success on others) is not discussed." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is applied." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "GPT-4o consistently outperforms open-source models across all TFHE code generation tasks in the baseline setting.", 365 "evidence": "Figure 3 shows GPT-4o achieving the highest Pass@k (comp) and Pass@k (func) scores across NOT, AND, OR, and ReLU, while open-source models achieve zero functional correctness (Section IV, RQ1).", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "Few-shot prompting is more effective than RAG alone for improving TFHE code generation.", 370 "evidence": "Comparing Fig. 5 (RAG) and Fig. 6 (few-shot): few-shot shows substantial improvements across models (e.g., Deepseek-Coder AND: 0.12→0.83 CrystalBLEU, 0.0→1.0 Pass@k), while RAG shows minimal gains (Section IV, RQ3).", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "Combining RAG and few-shot prompting yields the strongest overall improvements.", 375 "evidence": "Fig. 7 and Fig. 4 show RAG+few-shot achieves the highest CrystalBLEU scores, lowest error rates (Wrong Format, Repetition), and near-perfect correctness for GPT-4o and Claude-3.5-Haiku (Section IV, RQ3).", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Open-source models fail to generate functionally correct TFHE code in the baseline setting.", 380 "evidence": "Fig. 
3(c) shows Pass@k (func) scores of zero for CodeLlama, Qwen2.5-Coder, and Deepseek-Coder across all four tasks (Section IV, RQ1).", 381 "supported": "strong" 382 }, 383 { 384 "claim": "ReLU is the most challenging task for TFHE code generation across all models and techniques.", 385 "evidence": "Across all figures (3-7), ReLU consistently shows the lowest CrystalBLEU and Pass@k scores, with only GPT-4o achieving functional correctness. The paper attributes this to 'arithmetic complexity' (Section IV).", 386 "supported": "moderate" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "Tiny task set", 392 "detail": "The entire evaluation is based on only 4 tasks (NOT, AND, OR, ReLU). This is far too few to draw robust conclusions about TFHE code generation capability. The paper claims to establish 'the first benchmark for TFHE code generation' based on these 4 tasks." 393 }, 394 { 395 "flag": "No variance reported despite multiple runs", 396 "detail": "The paper averages over 5 runs but never reports standard deviation or any spread measure. With only 5 runs on 4 tasks, individual results could vary substantially, and the reader has no way to assess result stability." 397 }, 398 { 399 "flag": "Few-shot example overlaps with test set", 400 "detail": "The few-shot prompt uses a correct OR gate implementation as the example, and OR is also one of the four evaluated tasks. This conflates the 'training' signal with the test evaluation." 401 }, 402 { 403 "flag": "No statistical tests for comparative claims", 404 "detail": "The paper makes many comparative claims ('outperforms', 'significantly improves') without any statistical testing, relying solely on visual comparison of point estimates from 5 averaged runs." 405 }, 406 { 407 "flag": "Unfair model size comparison", 408 "detail": "Open-source models are all ~7B parameters while GPT-4o and Claude-3.5-Haiku are likely much larger. 
The performance gap may reflect model scale rather than anything specific to TFHE code generation, but this confound is not discussed." 409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "A survey on large language models for code generation", 414 "authors": ["J. Jiang", "F. Wang", "J. Shen", "S. Kim", "S. Kim"], 415 "year": 2024, 416 "arxiv_id": "2406.00515", 417 "relevance": "Survey of LLM code generation capabilities, directly relevant to understanding the state of the field this paper evaluates." 418 }, 419 { 420 "title": "Evaluating large language models trained on code", 421 "authors": ["M. Chen", "J. Tworek", "H. Jun"], 422 "year": 2021, 423 "arxiv_id": "2107.03374", 424 "relevance": "Introduces Codex and the HumanEval benchmark, foundational work for LLM code generation evaluation." 425 }, 426 { 427 "title": "CodeGen: An open large language model for code with multi-turn program synthesis", 428 "authors": ["E. Nijkamp", "B. Pang", "H. Hayashi"], 429 "year": 2022, 430 "arxiv_id": "2203.13474", 431 "relevance": "Open-source code generation model with multi-turn synthesis, relevant to agentic code generation." 432 }, 433 { 434 "title": "Code llama: Open foundation models for code", 435 "authors": ["B. Roziere", "J. Gehring", "F. Gloeckle"], 436 "year": 2023, 437 "arxiv_id": "2308.12950", 438 "relevance": "One of the evaluated open-source code models; foundational LLM for code." 439 }, 440 { 441 "title": "Qwen2.5-coder technical report", 442 "authors": ["B. Hui", "J. Yang", "Z. Cui"], 443 "year": 2024, 444 "arxiv_id": "2409.12186", 445 "relevance": "Technical report for one of the evaluated open-source code models." 446 }, 447 { 448 "title": "Deepseek-coder: When the large language model meets programming", 449 "authors": ["D. Guo", "Q. Zhu", "D. Yang"], 450 "year": 2024, 451 "arxiv_id": "2401.14196", 452 "relevance": "One of the evaluated open-source code models for domain-specific code generation." 
453 }, 454 { 455 "title": "GPT-4o system card", 456 "authors": ["A. Hurst", "A. Lerer", "A. P. Goucher"], 457 "year": 2024, 458 "arxiv_id": "2410.21276", 459 "relevance": "System card for the top-performing model in this evaluation." 460 }, 461 { 462 "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks", 463 "authors": ["P. Lewis", "E. Perez", "A. Piktus"], 464 "year": 2020, 465 "relevance": "Foundational RAG paper, relevant to agentic code generation augmentation techniques evaluated in this study." 466 }, 467 { 468 "title": "Language models are few-shot learners", 469 "authors": ["T. Brown", "B. Mann", "N. Ryder"], 470 "year": 2020, 471 "relevance": "Introduces the few-shot prompting paradigm used as a key technique in this evaluation." 472 }, 473 { 474 "title": "A survey on LLM-based code generation for low-resource and domain-specific programming languages", 475 "authors": ["S. Joel", "J. J. Wu", "F. H. Fard"], 476 "year": 2024, 477 "arxiv_id": "2410.03981", 478 "relevance": "Directly addresses domain-specific code generation challenges that this paper investigates for TFHE." 479 }, 480 { 481 "title": "On the robustness of code generation techniques: An empirical study on GitHub Copilot", 482 "authors": ["A. Mastropaolo", "L. Pascarella", "E. Guglielmi"], 483 "year": 2023, 484 "relevance": "Empirical evaluation of code generation robustness, relevant to understanding LLM code generation reliability." 485 }, 486 { 487 "title": "Benchmarking large language models for automated verilog RTL code generation", 488 "authors": ["S. Thakur", "B. Ahmad", "Z. Fan"], 489 "year": 2023, 490 "relevance": "Benchmarks LLMs for domain-specific hardware code generation, a methodology closely parallel to this paper's TFHE code generation evaluation." 491 } 492 ] 493 }