scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24081B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Enhancing Cross-Language Code Translation via Task-Specific Embedding Alignment in Retrieval-Augmented Generation",
      6     "authors": [
      7       "Manish Bhattarai",
      8       "Minh N. Vu",
      9       "Javier E. Santos",
     10       "Ismael Boureima",
     11       "Daniel O'Malley"
     12     ],
     13     "year": 2025,
     14     "venue": "KnowledgeNLP'25",
     15     "arxiv_id": null,
     16     "doi": "10.18653/v1/2025.knowledgenlp-1.8"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All abstract claims (14-15% improvements, enhanced retrieval and generation) are directly supported by experimental results showing CodeBLEU gains from 0.64→0.73 and 0.52→0.60.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Paired comparisons between aligned and unaligned embeddings with controlled variables (same LM, datasets, only embedding model varies) support causal claims that alignment improves translation quality.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Results are bounded to Fortran-to-C++ translation on two specific datasets. While the title is broad, experimental scope is clearly delimited to this language pair.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Limitations section discusses CodeBLEU issues but does not explore alternative explanations for improvements (e.g., whether gains stem from better retrieval in general vs. task-specific alignment specifically).",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Paper acknowledges CodeBLEU is a proxy (does not capture functional correctness), with limitations section noting 'may not always translate into functional equivalence.' Functional evaluation mentioned as future work.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Dedicated Section 6 'Limitations' provides substantial discussion of methodological constraints.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats identified: CodeBLEU doesn't capture functional equivalence, InfoNCE loss focus on linguistic similarity, granularity limitations of CodeBLEU, dependence on generated data quality, noise in training data.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Scope boundaries are not explicitly stated. Paper focuses on Fortran-C++ but does not explicitly say results may not generalize to other language pairs or problem types.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgements clearly state funding from 'LANL ASC grant AI4Coding and the LANL Institutional Computing Program, supported by the U.S. DOE NNSA.'",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors listed with Los Alamos National Laboratory affiliations. No affiliation with evaluated commercial products.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Funding from government research agency (DOE/LANL) with no direct financial stake in commercial deployment of this method.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement provided. No disclosure of patents, equity, or consulting relationships.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "RAG defined with citation (Lewis et al. 2020), CodeBLEU detailed with component breakdown (n-gram, syntax, semantics), S-InfoNCE formally defined with equations, contrastive learning explained in context.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Clearly states two-fold contribution: demonstrating effectiveness of contrastive learning for retrieval alignment in code translation, and showing optimizing retrieval yields state-of-the-art results without LLM fine-tuning.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 engages with rule-based translation, fine-tuning approaches, alignment techniques, and RAG. Shows how this work differs by optimizing retrieval without fine-tuning the LLM.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No code repository, GitHub link, or promise of future release provided.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Evaluation uses public benchmarks (HPC Fortran2C++ dataset, Numerical Recipes), and the training source corpus (Stack-V2) is public. The sampled training subset and synthetic translations are not released.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Training details provided (Adam, learning rate, batch size, temperature) but no requirements.txt, Dockerfile, or complete dependency list. No Python version specified.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Methods section describes approach but lacks step-by-step reproduction instructions. No code or scripts provided. Data preprocessing and model training would require reverse-engineering from text.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "Figure 2 reports means with standard deviations (0.73±0.17 aligned vs 0.64±0.19 unaligned). Figure 3 shows box plots with quartiles.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests (t-tests, p-values) reported despite comparative claims. Only descriptive statistics provided.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Absolute improvements (0.64→0.73, 0.52→0.60) and relative improvements (14%, 15%) explicitly reported in abstract and results.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "HPC (315 pairs), Numerical Recipes (298 pairs), Stack-V2 (25,000 sampled). No power analysis or justification for these choices provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Standard deviations reported in Figure 2 captions and box plots in Figure 3 show distribution variance across conditions.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Unaligned StarCoder embeddings serve as baseline. Compared in Figures 2-3 and Table 1.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "StarCoder (2023) is contemporary. LLaMA 3.1 (2024) and Mistral models are state-of-the-art.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "Only aligned vs unaligned comparison. No ablation on S-InfoNCE loss components, temperature sensitivity, or number of retrieved examples (k).",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": false,
    200           "justification": "CodeBLEU is the only quantitative metric for main results. Appendix A mentions 'small-scale manual check' but minimal functional evaluation provided.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "Appendix A provides only cursory human check ('majority compiled and produced expected outputs'). No rigorous human evaluation of translation quality.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "HPC and Numerical Recipes used as held-out test sets. Training on separate Stack-V2 synthetic data with no stated overlap.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results broken down by model size (8B vs 70B), dataset (HPC vs Numerical Recipes), and shot count (0-3 shots).",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "Figure 2 scatter plots show points where aligned underperforms unaligned, but these failures are not analyzed or discussed.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": false,
    230           "justification": "All reported results show aligned > unaligned. Figure 2 contains some points below the diagonal (aligned worse), but these are not discussed.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "LLaMA 3.1-8B/70B specified by version. Mistral lacks version number (minor issue). StarCoder specified with 125M parameters.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "No actual prompts or system instructions provided. Appendix A shows code examples but not the prompts used for generation.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Learning rate (10^-3), batch size (128), temperature (0.1), early stopping (epoch 20) reported. Retrieval count k is shown in the shot experiments.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "RAG framework described: retrieve top-k examples, condition LLM on retrieved pairs. Few-shot settings (1-3 shots) used.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Stack-V2 filtering (>500 bytes, prioritize by stars/forks) documented. Extraction of executable Fortran code from metadata-rich files described.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "All source datasets are public (Stack-V2 for training; HPC Fortran2C++ and Numerical Recipes for evaluation). Synthetic C++ translations not released.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Stack-V2 filtering criteria stated. Synthetic generation process described: Fortran→LLaMA→C++ translations. Evaluation datasets used as-is from public sources.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. Not applicable.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Training pipeline clear: Stack-V2→extract→generate→CodeBLEU→S-InfoNCE training. Evaluation pipeline: benchmarks→retrieve→generate→CodeBLEU.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "LLaMA 3.1 training cutoff not explicitly stated in paper. Standard knowledge suggests early 2024 cutoff, but not verified in text.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Stack-V2 (training) and HPC/Numerical Recipes (evaluation) noted as separate, but no analysis of whether test benchmarks appeared in Stack-V2 or LLaMA training.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "HPC Fortran2C++ (2023) and Numerical Recipes (1988) are public benchmarks likely in LLaMA 3.1 training data. No discussion of potential contamination.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants. Not applicable.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human subjects. Not applicable.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants. Not applicable.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants. Not applicable.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants. Not applicable.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants. Not applicable.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants. Not applicable.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "Training cost detailed (256 GH200 GPUs, 5 hours total) but inference cost/latency not reported. Computational cost for practitioners unclear.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Training hardware (256 GH200 GPUs, 20 epochs) and time (15 min per epoch) stated. No monetary cost estimated.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Task-specific embedding alignment significantly improves Fortran-to-C++ code translation quality measured by CodeBLEU",
    375       "evidence": "Figure 2 scatter plots and Table 1 show consistent improvements: 0.64→0.73 (14% relative) on HPC Fortran2C++, 0.52→0.60 (15% relative) on Numerical Recipes, across all four language models tested.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "S-InfoNCE loss successfully learns embeddings where semantically similar code (by CodeBLEU) is positioned closer in embedding space",
    380       "evidence": "Lemma 1 provides theoretical characterization of stationary points; Figure 2 empirically validates that aligned embeddings retrieve examples producing higher-quality translations.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Aligned embeddings provide larger benefits in few-shot prompting settings than unaligned embeddings",
    385       "evidence": "Table 1 shows aligned model improvements exceed unaligned in few-shot: e.g., aligned +0.346 vs unaligned +0.262 for 1-shot on HPC with LLaMA 70B.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Larger language models (70B parameters) outperform smaller models (8B) for code translation",
    390       "evidence": "Consistent pattern across Figures 2-3 and Table 1: LLaMA 3.1-70B achieves higher CodeBLEU scores than LLaMA 3.1-8B in all configurations.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Code translation performance gains plateau after 2-3 retrieved examples (diminishing marginal returns on shots)",
    395       "evidence": "Table 1 shows improvement deltas: 1→2 shots (+0.009 to +0.033), 2→3 shots (+0.006 to +0.015). Conclusion states 'majority of gains realized with just one or two examples.'",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "This approach achieves improvements without fine-tuning the underlying large language model",
    400       "evidence": "Abstract and methods explicitly state using fixed LLaMA/Mistral/Mixtral models; only StarCoder embedding model is trained via contrastive learning.",
    401       "supported": "strong"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "empirical"
    407   ],
    408   "key_findings": "This paper proposes aligning code embeddings to task-specific objectives (CodeBLEU scores) via contrastive learning (S-InfoNCE loss) within a retrieval-augmented generation framework for Fortran-to-C++ translation. Aligned embeddings consistently outperform unaligned baselines across multiple models and datasets (14-15% relative improvements), deliver larger gains in few-shot settings, and achieve these benefits without requiring expensive language model fine-tuning. Most translation improvements plateau after retrieving 2-3 examples.",
    409   "red_flags": [
    410     {
    411       "flag": "Functional equivalence not verified",
    412       "detail": "CodeBLEU evaluates syntactic/semantic similarity but not functional correctness. Appendix A's 'small-scale manual check' is minimal (just compilation + execution), insufficient for translation quality assurance."
    413     },
    414     {
    415       "flag": "Benchmark contamination unaddressed",
    416       "detail": "HPC Fortran2C++ and Numerical Recipes are public benchmarks likely present in LLaMA 3.1's training data. No analysis of train-test overlap or discussion of potential data contamination."
    417     },
    418     {
    419       "flag": "Limited baseline comparisons",
    420       "detail": "Only StarCoder embedding model tested with/without alignment. Related work mentions Nomic-Embed and CodeBERT but no empirical comparison to these alternative embeddings."
    421     },
    422     {
    423       "flag": "Failure cases not analyzed",
    424       "detail": "Figure 2 scatter plots show points where aligned underperforms unaligned, but these cases are not discussed or investigated."
    425     },
    426     {
    427       "flag": "Synthetic training data quality unexplored",
    428       "detail": "25,000 C++ translations generated by LLaMA 3.1-8B without verification. Noise in automatically-extracted and LLM-generated training data may degrade alignment quality."
    429     },
    430     {
    431       "flag": "Non-reproducible prompting",
    432       "detail": "No actual prompts or system instructions provided. Exact few-shot formatting and prompt construction cannot be replicated."
    433     },
    434     {
    435       "flag": "Code and model artifacts not released",
    436       "detail": "Neither the aligned StarCoder embedding checkpoint nor training/evaluation scripts are publicly available, blocking independent verification."
    437     },
    438     {
    439       "flag": "No ablation studies",
    440       "detail": "No ablation on S-InfoNCE loss components, temperature parameter sensitivity, or optimal retrieval count (k). Claims about alignment effectiveness lack component-level evidence."
    441     }
    442   ],
    443   "cited_papers": [
    444     {
    445       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    446       "relevance": "Foundational RAG framework that this work builds upon."
    447     },
    448     {
    449       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    450       "relevance": "Influential code embedding model; related work discusses as alternative to StarCoder."
    451     },
    452     {
    453       "title": "Evaluating large language models trained on code (Codex)",
    454       "relevance": "Seminal work on LLM code capabilities; establishes baseline for code translation."
    455     },
    456     {
    457       "title": "CodeBLEU: a method for automatic evaluation of code synthesis",
    458       "relevance": "Core evaluation metric used for training alignment and measuring translation quality."
    459     },
    460     {
    461       "title": "Creating a dataset for high-performance computing code translation using LLMs",
    462       "relevance": "Source of HPC Fortran2C++ evaluation benchmark."
    463     },
    464     {
    465       "title": "StarCoder 2 and the Stack v2: the next generation",
    466       "relevance": "Provides Stack-V2 training corpus and StarCoder embedding model."
    467     },
    468     {
    469       "title": "StarCoder: may the source be with you!",
    470       "relevance": "StarCoder model used as embedding backbone for retrieval alignment."
    471     },
    472     {
    473       "title": "Llama: Open and efficient foundation language models",
    474       "relevance": "LLaMA models (8B, 70B) used for evaluation and synthetic data generation."
    475     }
    476   ],
    477   "engagement_factors": {
    478     "practical_relevance": {
    479       "score": 2,
    480       "justification": "Method avoids fine-tuning (practical) but training requires 256 GH200 GPUs, limiting accessibility. Applicability bounded to Fortran-C++ unless extended to other language pairs."
    481     },
    482     "surprise_contrarian": {
    483       "score": 1,
    484       "justification": "Task-specific retrieval alignment in RAG is conceptually straightforward; contribution is incremental optimization of a known approach rather than novel insight."
    485     },
    486     "fear_safety": {
    487       "score": 0,
    488       "justification": "No safety, security, or alignment concerns raised or addressed. Purely a code translation engineering problem."
    489     },
    490     "drama_conflict": {
    491       "score": 0,
    492       "justification": "No controversy, competing frameworks, or adversarial framing. Straightforward technical contribution."
    493     },
    494     "demo_ability": {
    495       "score": 1,
    496       "justification": "Could demo on small scale (inference is lightweight) but full training requires massive GPU resources. No public model checkpoint or demo provided."
    497     },
    498     "brand_recognition": {
    499       "score": 2,
    500       "justification": "Authors are from a respected institution (Los Alamos National Lab) and use well-known models (LLaMA, Mixtral), but the paper was published at a workshop (KnowledgeNLP'25) rather than a top-tier venue."
    501     }
    502   },
    503   "hn_data": {
    504     "threads": [],
    505     "top_points": 0,
    506     "total_points": 0,
    507     "total_comments": 0
    508   }
    509 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs