scan-v5.json (24081B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Enhancing Cross-Language Code Translation via Task-Specific Embedding Alignment in Retrieval-Augmented Generation", 6 "authors": [ 7 "Manish Bhattarai", 8 "Minh N. Vu", 9 "Javier E. Santos", 10 "Ismael Boureima", 11 "Daniel O'Malley" 12 ], 13 "year": 2025, 14 "venue": "KnowledgeNLP'25", 15 "arxiv_id": null, 16 "doi": "10.18653/v1/2025.knowledgenlp-1.8" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "All abstract claims (14-15% improvements, enhanced retrieval and generation) are directly supported by experimental results showing CodeBLEU gains from 0.64→0.73 and 0.52→0.60.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Paired comparisons between aligned and unaligned embeddings with controlled variables (same LM, datasets, only embedding model varies) support causal claims that alignment improves translation quality.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": true, 35 "justification": "Results are bounded to Fortran-to-C++ translation on two specific datasets. While the title is broad, experimental scope is clearly delimited to this language pair.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "Limitations section discusses CodeBLEU issues but does not explore alternative explanations for improvements (e.g., whether gains stem from better retrieval in general vs. task-specific alignment specifically).", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "Paper acknowledges CodeBLEU is a proxy (does not capture functional correctness), with limitations section noting 'may not always translate into functional equivalence.' Functional evaluation mentioned as future work.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Dedicated Section 6 'Limitations' provides substantial discussion of methodological constraints.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": true, 61 "justification": "Specific threats identified: CodeBLEU doesn't capture functional equivalence, InfoNCE loss focus on linguistic similarity, granularity limitations of CodeBLEU, dependence on generated data quality, noise in training data.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "Scope boundaries are not explicitly stated. Paper focuses on Fortran-C++ but does not explicitly say results may not generalize to other language pairs or problem types.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": true, 75 "justification": "Acknowledgements clearly state funding from 'LANL ASC grant AI4Coding and the LANL Institutional Computing Program, supported by the U.S. DOE NNSA.'", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "All authors listed with Los Alamos National Laboratory affiliations. No affiliation with evaluated commercial products.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": true, 87 "justification": "Funding from government research agency (DOE/LANL) with no direct financial stake in commercial deployment of this method.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests statement provided. No disclosure of patents, equity, or consulting relationships.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "RAG defined with citation (Lewis et al. 2020), CodeBLEU detailed with component breakdown (n-gram, syntax, semantics), S-InfoNCE formally defined with equations, contrastive learning explained in context.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Clearly states two-fold contribution: demonstrating effectiveness of contrastive learning for retrieval alignment in code translation, and showing optimizing retrieval yields state-of-the-art results without LLM fine-tuning.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 2 engages with rule-based translation, fine-tuning approaches, alignment techniques, and RAG. Shows how this work differs by optimizing retrieval without fine-tuning the LLM.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": false, 124 "justification": "No code repository, GitHub link, or promise of future release provided.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "Evaluation uses standard public benchmarks (HPC Fortran2C++ dataset, Numerical Recipes, Stack-V2). Training data and synthetic translations not released.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "Training details provided (Adam, learning rate, batch size, temperature) but no requirements.txt, Dockerfile, or complete dependency list. No Python version specified.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "Methods section describes approach but lacks step-by-step reproduction instructions. No code or scripts provided. Data preprocessing and model training would require reverse-engineering from text.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": true, 150 "justification": "Figure 2 reports means with standard deviations (0.73±0.17 aligned vs 0.64±0.19 unaligned). Figure 3 shows box plots with quartiles.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests (t-tests, p-values) reported despite comparative claims. Only descriptive statistics provided.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Absolute improvements (0.64→0.73, 0.52→0.60) and relative improvements (14%, 15%) explicitly reported in abstract and results.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "HPC (315 pairs), Numerical Recipes (298 pairs), Stack-V2 (25,000 sampled). No power analysis or justification for these choices provided.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": true, 174 "justification": "Standard deviations reported in Figure 2 captions and box plots in Figure 3 show distribution variance across conditions.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Unaligned StarCoder embeddings serve as baseline. Compared in Figures 2-3 and Table 1.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "StarCoder (2023) is contemporary. LLaMA 3.1 (2024) and Mistral models are state-of-the-art.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": false, 194 "justification": "Only aligned vs unaligned comparison. No ablation on S-InfoNCE loss components, temperature sensitivity, or number of retrieved examples (k).", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": false, 200 "justification": "CodeBLEU is the only quantitative metric for main results. Appendix A mentions 'small-scale manual check' but minimal functional evaluation provided.", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": true, 205 "answer": false, 206 "justification": "Appendix A provides only cursory human check ('majority compiled and produced expected outputs'). No rigorous human evaluation of translation quality.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "HPC and Numerical Recipes used as held-out test sets. Training on separate Stack-V2 synthetic data with no stated overlap.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Results broken down by model size (8B vs 70B), dataset (HPC vs Numerical Recipes), and shot count (0-3 shots).", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": false, 224 "justification": "Figure 2 scatter plots show points where aligned underperforms unaligned, but these failures are not analyzed or discussed.", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": false, 230 "justification": "All reported results show aligned > unaligned. Figure 2 contains some points below the diagonal (aligned worse) but are not discussed.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 "justification": "LLaMA 3.1-8B/70B specified by version. Mistral lacks version number (minor issue). StarCoder specified with 125M parameters.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": false, 244 "justification": "No actual prompts or system instructions provided. Appendix A shows code examples but not the prompts used for generation.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Learning rate (10^-3), batch size (128), temperature (0.1), early stopping (epoch 20) reported. Retrieve count k shown in shot experiments.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": true, 255 "answer": true, 256 "justification": "RAG framework described: retrieve top-k examples, condition LLM on retrieved pairs. Few-shot settings (1-3 shots) used.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Stack-V2 filtering (>500 bytes, prioritize by stars/forks) documented. Extraction of executable Fortran code from metadata-rich files described.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": true, 270 "justification": "All evaluation datasets are public (Stack-V2, HPC Fortran2C++, Numerical Recipes). Synthetic C++ translations not released.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "Stack-V2 filtering criteria stated. Synthetic generation process described: Fortran→LLaMA→C++ translations. Evaluation datasets used as-is from public sources.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants. Not applicable.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "Training pipeline clear: Stack-V2→extract→generate→CodeBLEU→S-InfoNCE training. Evaluation pipeline: benchmarks→retrieve→generate→CodeBLEU.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "LLaMA 3.1 training cutoff not explicitly stated in paper. Standard knowledge suggests early 2024 cutoff, but not verified in text.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": false, 302 "justification": "Stack-V2 (training) and HPC/Numerical Recipes (evaluation) noted as separate, but no analysis of whether test benchmarks appeared in Stack-V2 or LLaMA training.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": false, 308 "justification": "HPC Fortran2C++ (2023) and Numerical Recipes (1988) are public benchmarks likely in LLaMA 3.1 training data. No discussion of potential contamination.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants. Not applicable.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human subjects. Not applicable.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants. Not applicable.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants. Not applicable.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants. Not applicable.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants. Not applicable.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants. Not applicable.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": false, 360 "justification": "Training cost detailed (256 GH200 GPUs, 5 hours total) but inference cost/latency not reported. Computational cost for practitioners unclear.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": true, 366 "justification": "Training hardware (256 GH200 GPUs, 20 epochs) and time (15 min per epoch) stated. No monetary cost estimated.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "Task-specific embedding alignment significantly improves Fortran-to-C++ code translation quality measured by CodeBLEU", 375 "evidence": "Figure 2 scatter plots and Table 1 show consistent improvements: 0.64→0.73 (14% relative) on HPC Fortran2C++, 0.52→0.60 (15% relative) on Numerical Recipes, across all four language models tested.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "S-InfoNCE loss successfully learns embeddings where semantically similar code (by CodeBLEU) is positioned closer in embedding space", 380 "evidence": "Lemma 1 provides theoretical characterization of stationary points; Figure 2 empirically validates that aligned embeddings retrieve examples producing higher-quality translations.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Aligned embeddings provide larger benefits in few-shot prompting settings than unaligned embeddings", 385 "evidence": "Table 1 shows aligned model improvements exceed unaligned in few-shot: e.g., aligned +0.346 vs unaligned +0.262 for 1-shot on HPC with LLaMA 70B.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Larger language models (70B parameters) outperform smaller models (8B) for code translation", 390 "evidence": "Consistent pattern across Figures 2-3 and Table 1: LLaMA 3.1-70B achieves higher CodeBLEU scores than LLaMA 3.1-8B in all configurations.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "Code translation performance gains plateau after 2-3 retrieved examples (diminishing marginal returns on shots)", 395 "evidence": "Table 1 shows improvement deltas: 1→2 shots (+0.009 to +0.033), 2→3 shots (+0.006 to +0.015). Conclusion states 'majority of gains realized with just one or two examples.'", 396 "supported": "strong" 397 }, 398 { 399 "claim": "This approach achieves improvements without fine-tuning the underlying large language model", 400 "evidence": "Abstract and methods explicitly state using fixed LLaMA/Mistral/Mixtral models; only StarCoder embedding model is trained via contrastive learning.", 401 "supported": "strong" 402 } 403 ], 404 "methodology_tags": [ 405 "benchmark-eval", 406 "empirical" 407 ], 408 "key_findings": "This paper proposes aligning code embeddings to task-specific objectives (CodeBLEU scores) via contrastive learning (S-InfoNCE loss) within a retrieval-augmented generation framework for Fortran-to-C++ translation. Aligned embeddings consistently outperform unaligned baselines across multiple models and datasets (14-15% relative improvements), deliver larger gains in few-shot settings, and achieve these benefits without requiring expensive language model fine-tuning. Most translation improvements plateau after retrieving 2-3 examples.", 409 "red_flags": [ 410 { 411 "flag": "Functional equivalence not verified", 412 "detail": "CodeBLEU evaluates syntactic/semantic similarity but not functional correctness. Appendix A's 'small-scale manual check' is minimal (just compilation + execution), insufficient for translation quality assurance." 413 }, 414 { 415 "flag": "Benchmark contamination unaddressed", 416 "detail": "HPC Fortran2C++ and Numerical Recipes are public benchmarks likely present in LLaMA 3.1's training data. No analysis of train-test overlap or discussion of potential data contamination." 417 }, 418 { 419 "flag": "Limited baseline comparisons", 420 "detail": "Only StarCoder embedding model tested with/without alignment. Related work mentions Nomic-Embed and CodeBERT but no empirical comparison to these alternative embeddings." 421 }, 422 { 423 "flag": "Failure cases not analyzed", 424 "detail": "Figure 2 scatter plots show points where aligned underperforms unaligned, but these cases are not discussed or investigated." 425 }, 426 { 427 "flag": "Synthetic training data quality unexplored", 428 "detail": "25,000 C++ translations generated by LLaMA 3.1-8B without verification. Noise in automatically-extracted and LLM-generated training data may degrade alignment quality." 429 }, 430 { 431 "flag": "Non-reproducible prompting", 432 "detail": "No actual prompts or system instructions provided. Exact few-shot formatting and prompt construction cannot be replicated." 433 }, 434 { 435 "flag": "Code and model artifacts not released", 436 "detail": "Neither the aligned StarCoder embedding checkpoint nor training/evaluation scripts are publicly available, blocking independent verification." 437 }, 438 { 439 "flag": "No ablation studies", 440 "detail": "No ablation on S-InfoNCE loss components, temperature parameter sensitivity, or optimal retrieval count (k). Claims about alignment effectiveness lack component-level evidence." 441 } 442 ], 443 "cited_papers": [ 444 { 445 "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks", 446 "relevance": "Foundational RAG framework that this work builds upon." 447 }, 448 { 449 "title": "CodeBERT: A pre-trained model for programming and natural languages", 450 "relevance": "Influential code embedding model; related work discusses as alternative to StarCoder." 451 }, 452 { 453 "title": "Evaluating large language models trained on code (Codex)", 454 "relevance": "Seminal work on LLM code capabilities; establishes baseline for code translation." 455 }, 456 { 457 "title": "CodeBLEU: a method for automatic evaluation of code synthesis", 458 "relevance": "Core evaluation metric used for training alignment and measuring translation quality." 459 }, 460 { 461 "title": "Creating a dataset for high-performance computing code translation using LLMs", 462 "relevance": "Source of HPC Fortran2C++ evaluation benchmark." 463 }, 464 { 465 "title": "StarCoder 2 and the Stack v2: the next generation", 466 "relevance": "Provides Stack-V2 training corpus and StarCoder embedding model." 467 }, 468 { 469 "title": "StarCoder: may the source be with you!", 470 "relevance": "StarCoder model used as embedding backbone for retrieval alignment." 471 }, 472 { 473 "title": "Llama: Open and efficient foundation language models", 474 "relevance": "LLaMA models (8B, 70B) used for evaluation and synthetic data generation." 475 } 476 ], 477 "engagement_factors": { 478 "practical_relevance": { 479 "score": 2, 480 "justification": "Method avoids fine-tuning (practical) but training requires 256 GH200 GPUs, limiting accessibility. Applicability bounded to Fortran-C++ unless extended to other language pairs." 481 }, 482 "surprise_contrarian": { 483 "score": 1, 484 "justification": "Task-specific retrieval alignment in RAG is conceptually straightforward; contribution is incremental optimization of a known approach rather than novel insight." 485 }, 486 "fear_safety": { 487 "score": 0, 488 "justification": "No safety, security, or alignment concerns raised or addressed. Purely a code translation engineering problem." 489 }, 490 "drama_conflict": { 491 "score": 0, 492 "justification": "No controversy, competing frameworks, or adversarial framing. Straightforward technical contribution." 493 }, 494 "demo_ability": { 495 "score": 1, 496 "justification": "Could demo on small scale (inference is lightweight) but full training requires massive GPU resources. No public model checkpoint or demo provided." 497 }, 498 "brand_recognition": { 499 "score": 2, 500 "justification": "Authors from respectable institution (Los Alamos National Lab), uses well-known models (LLaMA, Mixtral), but published in workshop (KnowledgeNLP'25) rather than top-tier venue." 501 } 502 }, 503 "hn_data": { 504 "threads": [], 505 "top_points": 0, 506 "total_points": 0, 507 "total_comments": 0 508 } 509 }