scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27840B)
      1 {
      2   "paper": {
      3     "title": "Enhancing Cross-Language Code Translation via Task-Specific Embedding Alignment in Retrieval-Augmented Generation",
      4     "authors": [
      5       "Manish Bhattarai",
      6       "Minh Vu",
      7       "Javier E. Santos",
      8       "Ismael Boureima",
      9       "Daniel O'Malley"
     10     ],
     11     "year": 2025,
     12     "venue": "KnowledgeNLP'25 (4th International Workshop on Knowledge-Augmented Methods for Natural Language Processing)",
     13     "doi": "10.18653/v1/2025.knowledgenlp-1.8"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No code repository URL is provided anywhere in the paper. The aligned embedding model and pipeline code are not released."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The evaluation datasets (HPC Fortran2C++ and Numerical Recipes) and the source dataset (Stack-V2) are publicly available. However, the 25K curated Fortran snippets and generated C++ translations used for alignment are not released."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper mentions hardware (256 GH200 GPUs) and model names but provides no software environment specifications, dependency lists, or version details for libraries used."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. A researcher would need to reverse-engineer the pipeline from the method description."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Standard deviations are reported in Figure 2 (e.g., 'Avg Aligned: 0.73 ± 0.17') and box plots in Figure 3 show distributional spread including quartiles."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "No statistical significance tests (p-values, t-tests, etc.) are reported despite claims that aligned embeddings 'significantly' improve performance over unaligned. Comparisons are based solely on mean differences."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Relative improvements are reported (14% and 15%) with baseline context (e.g., 'from 0.64 to 0.73'). Table 1 shows delta values across all configurations."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No justification for the evaluation dataset sizes (315 and 298 pairs) or the 25,000 training snippets. No power analysis or discussion of whether these sizes are sufficient for the claims made."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Standard deviations are reported in Figure 2 across shot configurations, and Figure 3 provides box plots showing distributional spread of CodeBLEU scores."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The paper compares aligned embeddings against unaligned (generic) embeddings using the same RAG framework, and includes zero-shot (no RAG) as an additional baseline in Table 1."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Baselines include StarEncoder (2023), and the comparison setup follows Bhattarai et al. (2024). Models tested include LLaMA 3.1 (2024) and Mixtral (2024)."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The comparison of aligned vs. unaligned embeddings effectively ablates the alignment component. Table 1 shows zero-shot (no retrieval) vs. few-shot (with retrieval) across shot counts, isolating the contribution of both RAG and alignment."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "Only CodeBLEU is used as the evaluation metric. Although CodeBLEU is itself a composite of four sub-components, no additional independent metrics (e.g., compilation rate, functional correctness, BLEU, CrystalBLEU) are reported."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "The paper mentions 'a small-scale manual check on a subset of translations' (Section 4) but does not report this systematically — no sample size, no inter-rater agreement, no quantitative results. The appendix shows a single qualitative example."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The embedding model is trained on Stack-V2 data, while evaluation is conducted on separate datasets: HPC Fortran2C++ (315 pairs) and Numerical Recipes (298 pairs)."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Table 1 provides breakdowns by model (4 models), dataset (2 datasets), and shot count (0-3 shots). Figure 2 scatter plots show per-sample variation."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "No failure cases are discussed. While scatter plots in Figure 2 show some samples where aligned embeddings perform worse than unaligned, these cases are not analyzed or explained."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "Every experiment shows improvement from alignment. The observation of 'diminishing marginal gains' beyond 2 shots is the closest to a negative finding, but no configurations that hurt performance or approaches that were tried and abandoned are reported."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Abstract claims of CodeBLEU improvement from 0.64→0.73 (HPC) and 0.52→0.60 (Numerical Recipes), representing 14% and 15% relative improvement, are directly supported by Figure 2 and Table 1."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The causal claim that embedding alignment improves translation quality is supported by controlled comparison — aligned vs. unaligned embeddings are tested with the same models, datasets, and RAG pipeline, isolating the single variable of embedding alignment."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The abstract and paper consistently specify 'from Fortran to C++'. The conclusion explicitly notes 'Future work could extend this alignment strategy to additional programming languages,' acknowledging the scope limitation."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The Limitations section (Section 6) discusses multiple alternative explanations: CodeBLEU may not reflect functional equivalence, contrastive learning may misalign syntactically different but functionally similar code, and noisy training data may degrade alignment."
    132       },
    133       "proxy_outcome_distinction": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "The paper explicitly acknowledges in Section 4 that 'CodeBLEU may not capture all functional nuances' and performed manual checks. The Limitations section extensively discusses how CodeBLEU (the proxy) may not translate to functional correctness (the desired outcome)."
    137       }
    138     },
    139     "setup_transparency": {
    140       "model_versions_specified": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Models are specified with family, size, and variant: 'LLaMA 3.1-8B', 'LLaMA 3.1-70B', 'Mistral123B', 'Mixtral 8x22B' (all instruct-tuned), and 'StarCoder model with 125M parameters' for embeddings."
    144       },
    145       "prompts_provided": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "The actual prompts used for code translation are not provided. The appendix shows example input/output but not the prompt template or system instructions given to the LLMs."
    149       },
    150       "hyperparameters_reported": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "Embedding training hyperparameters are reported (temperature τ=0.1, Adam optimizer, lr=10⁻³, batch size 128). However, LLM inference parameters (temperature, top-p, max tokens), which significantly affect generation output, are not stated."
    154       },
    155       "scaffolding_described": {
    156         "applies": false,
    157         "answer": false,
    158         "justification": "No agentic scaffolding is used. The system is a standard RAG pipeline (embed → retrieve → generate) without retry logic, tool use, or multi-step reasoning."
    159       },
    160       "data_preprocessing_documented": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section 4 describes sampling criteria ('files larger than 500 bytes', 'highest combined star and fork counts'), extraction using LLaMA 3.1-70B Instruct to isolate executable Fortran code, and translation pipeline for generating C++ pairs."
    164       }
    165     },
    166     "limitations_and_scope": {
    167       "limitations_section_present": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 6 is a dedicated 'Limitations' section with substantial discussion spanning multiple paragraphs covering four distinct limitation areas."
    171       },
    172       "threats_to_validity_specific": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "The Limitations section discusses specific threats: CodeBLEU not capturing functional equivalence, contrastive learning treating syntactically different but functionally similar code as negatives, granularity issues with continuous similarity scores, and noise in training data affecting alignment quality."
    176       },
    177       "scope_boundaries_stated": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "The paper explicitly states scope is Fortran→C++ translation. The conclusion notes 'Future work could extend this alignment strategy to additional programming languages.' Limitations state CodeBLEU doesn't capture functional behavior."
    181       }
    182     },
    183     "data_integrity": {
    184       "raw_data_available": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "While evaluation datasets are public, the aligned embedding model, the 25K generated C++ translations, pairwise CodeBLEU matrices, and per-sample evaluation scores are not released for independent verification."
    188       },
    189       "data_collection_described": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Section 4 describes sampling 25,000 Fortran snippets from Stack-V2 (>500 bytes, highest star/fork counts), using LLaMA 3.1-70B Instruct for Fortran extraction, and generating C++ translations with LLaMA 3.1-8B."
    193       },
    194       "recruitment_methods_described": {
    195         "applies": false,
    196         "answer": false,
    197         "justification": "No human participants. Data sources are standard public datasets (Stack-V2, HPC Fortran2C++, Numerical Recipes)."
    198       },
    199       "data_pipeline_documented": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "The pipeline stages are described at a high level (sample → extract → translate → compute CodeBLEU → train embeddings), but intermediate counts are missing. How many of the 500K+ Stack-V2 files survived the 500-byte and quality filtering before reaching 25K is not stated."
    203       }
    204     },
    205     "conflicts_of_interest": {
    206       "funding_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Section 7 states: 'This research was funded by the LANL ASC grant AI4Coding and the LANL Institutional Computing Program, supported by the U.S. DOE NNSA under Contract No. 89233218CNA000001.'"
    210       },
    211       "affiliations_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "All authors are affiliated with Los Alamos National Laboratory, clearly listed under the title. They evaluate open-source models (LLaMA, Mistral, Mixtral), not their own products."
    215       },
    216       "funder_independent_of_outcome": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Funding is from DOE/NNSA for general AI4Coding research. The funder does not have a commercial stake in the specific outcome of aligned vs. unaligned embedding performance."
    220       },
    221       "financial_interests_declared": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No competing interests or financial interests statement is present in the paper."
    225       }
    226     },
    227     "contamination": {
    228       "training_cutoff_stated": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No training data cutoff dates are stated for any of the evaluated models (LLaMA 3.1, Mistral, Mixtral). These models could have seen the evaluation benchmarks during training."
    232       },
    233       "train_test_overlap_discussed": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No discussion of whether HPC Fortran2C++ (2023) or Numerical Recipes code pairs appeared in the training data of LLaMA 3.1, Mistral, or Mixtral models."
    237       },
    238       "benchmark_contamination_addressed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "The Numerical Recipes dataset is from 1988 and its code has been widely available online for decades. HPC Fortran2C++ was published in 2023. Neither benchmark's potential presence in model training data is discussed."
    242       }
    243     },
    244     "human_studies": {
    245       "pre_registered": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in this study."
    249       },
    250       "irb_or_ethics_approval": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "demographics_reported": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "inclusion_exclusion_criteria": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "randomization_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "blinding_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "attrition_reported": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       }
    280     },
    281     "cost_and_practicality": {
    282       "inference_cost_reported": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No inference cost, latency, or per-translation cost is reported. The paper describes training time (15 min/epoch on 256 GPUs) but not the cost of using the aligned RAG system at inference time."
    286       },
    287       "compute_budget_stated": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Section 4 states training was 'distributed across 256 GH200 GPUs' taking 'approximately 15 minutes per epoch, with early stopping at epoch 20.' Scaling estimates for 64 and 32 GPUs are also provided."
    291       }
    292     },
    293     "experimental_rigor": {
    294       "seed_sensitivity_reported": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No mention of random seeds, seed sensitivity analysis, or results across multiple seeds."
    298       },
    299       "number_of_runs_stated": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "The number of experimental runs is not explicitly stated. It is unclear whether results represent single runs or averages over multiple runs."
    303       },
    304       "hyperparameter_search_budget": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "While specific hyperparameter values are given (τ=0.1, lr=10⁻³, batch=128), no search budget, search method, or number of configurations tried is reported."
    308       },
    309       "best_config_selection_justified": {
    310         "applies": true,
    311         "answer": true,
    312         "justification": "Results are reported for all configurations tested (4 models × 2 datasets × shot counts 0–3, with aligned and unaligned retrieval for each few-shot setting; zero-shot uses no retrieval), not just the best. Table 1 presents the full matrix."
    313       },
    314       "multiple_comparison_correction": {
    315         "applies": false,
    316         "answer": false,
    317         "justification": "No statistical tests are performed at all, so correction for multiple comparisons is not applicable. The absence of statistical testing is captured by the significance_tests item."
    318       },
    319       "self_comparison_bias_addressed": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "No acknowledgment of potential bias from evaluating their own alignment method. They implement both the aligned and unaligned pipelines without discussing whether this introduces bias."
    323       },
    324       "compute_budget_vs_performance": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The alignment requires training on 256 GH200 GPUs for 20 epochs, but performance gains are not discussed relative to this substantial compute cost. No comparison at matched compute budgets."
    328       },
    329       "benchmark_construct_validity": {
    330         "applies": true,
    331         "answer": true,
    332         "justification": "The paper explicitly discusses CodeBLEU's construct validity: Section 4 notes it 'may not capture all functional nuances' and the Limitations section extensively discusses the gap between CodeBLEU scores and functional equivalence."
    333       },
    334       "scaffold_confound_addressed": {
    335         "applies": true,
    336         "answer": true,
    337         "justification": "The RAG scaffold is held constant across all comparisons — only the embedding model (aligned vs. unaligned) varies. The same retrieval pipeline, same k values, and same LLMs are used for both conditions."
    338       }
    339     },
    340     "data_leakage": {
    341       "temporal_leakage_addressed": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "No discussion of temporal leakage. The Numerical Recipes code has been publicly available since 1988, and HPC Fortran2C++ since 2023. LLaMA 3.1 and other models may have seen these in training."
    345       },
    346       "feature_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "No discussion of whether the RAG retrieval examples leak information that would not be available in a real deployment scenario."
    350       },
    351       "non_independence_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether Stack-V2 (used for alignment training) shares code with the HPC Fortran2C++ or Numerical Recipes evaluation sets, which could inflate alignment effectiveness."
    355       },
    356       "leakage_detection_method": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No concrete leakage detection or prevention methods are applied (no canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines)."
    360       }
    361     }
    362   },
    363   "scan_version": 3,
    364   "active_modules": [
    365     "experimental_rigor",
    366     "data_leakage"
    367   ],
    368   "claims": [
    369     {
    370       "claim": "Task-specific embedding alignment improves average CodeBLEU from 0.64 to 0.73 (14% relative improvement) on HPC Fortran2C++ dataset.",
    371       "evidence": "Figure 2 (llama3.1 70b scatter plot) and Table 1 show aligned embeddings consistently outperform unaligned across all models and shot counts on HPC Fortran2C++.",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "On the Numerical Recipes dataset, alignment improves average CodeBLEU from 0.52 to 0.60 (15% relative improvement).",
    376       "evidence": "Figure 2 (Numerical Recipe panels) and Table 1 show consistent improvement across models and shot counts.",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "These gains are realized without any fine-tuning of the language model.",
    381       "evidence": "The method description (Section 3) and experimental setup (Section 4) confirm only the embedding model is trained; the LLMs (LLaMA, Mistral, Mixtral) are used as-is.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Diminishing returns are observed beyond 2-shot prompting.",
    386       "evidence": "Table 1 shows that the gain from 2-shot to 3-shot is smaller than the gain from 1-shot to 2-shot across most configurations.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Aligned embeddings reduce performance variability compared to unaligned embeddings.",
    391       "evidence": "Figure 3 box plots show tighter interquartile ranges for aligned configurations, and Figure 2 reports lower standard deviations (e.g., 0.17 vs 0.19 for HPC with llama3.1 70b).",
    392       "supported": "moderate"
    393     }
    394   ],
    395   "methodology_tags": [
    396     "benchmark-eval"
    397   ],
    398   "key_findings": "The paper demonstrates that aligning retrieval embeddings with task-specific CodeBLEU scores via contrastive learning (S-InfoNCE) improves Fortran-to-C++ code translation quality by 14-15% on two benchmarks without fine-tuning the LLM. The approach works consistently across four different models (LLaMA 3.1-8B/70B, Mistral 123B, Mixtral 8x22B) and shows diminishing returns beyond 2-shot prompting. Larger models benefit more from alignment, and aligned embeddings also reduce output variability.",
    399   "red_flags": [
    400     {
    401       "flag": "Single evaluation metric",
    402       "detail": "The entire evaluation relies solely on CodeBLEU, which the authors themselves acknowledge may not capture functional correctness. No compilation rate, execution correctness, or other independent metrics are reported."
    403     },
    404     {
    405       "flag": "No statistical significance tests",
    406       "detail": "Claims of 'significant improvement' are made without any formal statistical tests. With evaluation sets of only 315 and 298 pairs, the observed differences could potentially be explained by chance."
    407     },
    408     {
    409       "flag": "Small evaluation datasets",
    410       "detail": "Results are based on just 315 (HPC Fortran2C++) and 298 (Numerical Recipes) code pairs. No discussion of whether these sizes are sufficient for reliable conclusions."
    411     },
    412     {
    413       "flag": "No contamination analysis",
    414       "detail": "Numerical Recipes code has been publicly available since 1988 and widely distributed. The evaluated LLMs may have memorized these translations during training, which could confound the results."
    415     },
    416     {
    417       "flag": "Massive compute without cost-benefit analysis",
    418       "detail": "Training requires 256 GH200 GPUs for 20 epochs, but the paper does not discuss whether the 14-15% CodeBLEU improvement justifies this compute cost, nor does it report inference-time overhead."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    424       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"],
    425       "year": 2020,
    426       "relevance": "Foundational RAG paper that established the retrieval-augmented generation framework used in this work."
    427     },
    428     {
    429       "title": "Evaluating large language models trained on code",
    430       "authors": ["Mark Chen", "Jerry Tworek"],
    431       "year": 2021,
    432       "arxiv_id": "2107.03374",
    433       "relevance": "Introduced Codex and HumanEval; foundational work on LLM-based code generation evaluation."
    434     },
    435     {
    436       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    437       "authors": ["Zhangyin Feng", "Daya Guo"],
    438       "year": 2020,
    439       "relevance": "Pre-trained code embedding model used as a baseline embedding approach in code translation RAG systems."
    440     },
    441     {
    442       "title": "Enhancing code translation in language models with few-shot learning via retrieval-augmented generation",
    443       "authors": ["Manish Bhattarai", "Javier E Santos"],
    444       "year": 2024,
    445       "arxiv_id": "2407.19619",
    446       "relevance": "Prior work by same authors establishing the RAG-based code translation framework that this paper extends with aligned embeddings."
    447     },
    448     {
    449       "title": "StarCoder: may the source be with you!",
    450       "authors": ["Raymond Li", "Loubna Ben Allal"],
    451       "year": 2023,
    452       "arxiv_id": "2305.06161",
    453       "relevance": "Introduces the open-source StarCoder code LLM together with the companion StarEncoder (125M) encoder model, which is used as the embedding backbone for retrieval alignment."
    454     },
    455     {
    456       "title": "CodeBLEU: a method for automatic evaluation of code synthesis",
    457       "authors": ["Shuo Ren", "Daya Guo"],
    458       "year": 2020,
    459       "arxiv_id": "2009.10297",
    460       "relevance": "Defines the CodeBLEU metric used both as the evaluation metric and as the alignment signal for embedding training."
    461     },
    462     {
    463       "title": "StarCoder 2 and the Stack V2: The next generation",
    464       "authors": ["Anton Lozhkov", "Raymond Li"],
    465       "year": 2024,
    466       "arxiv_id": "2402.19173",
    467       "relevance": "Provides the Stack-V2 dataset (500K+ Fortran snippets) used as the source corpus for embedding alignment training."
    468     },
    469     {
    470       "title": "Granite code models: A family of open foundation models for code intelligence",
    471       "authors": ["Mayank Mishra", "Matt Stallone"],
    472       "year": 2024,
    473       "arxiv_id": "2405.04324",
    474       "relevance": "Open-source code model family; cited for task-specific alignment approaches in code intelligence."
    475     },
    476     {
    477       "title": "Creating a dataset for high-performance computing code translation using LLMs: A bridge between OpenMP Fortran and C++",
    478       "authors": ["Bin Lei", "Caiwen Ding"],
    479       "year": 2023,
    480       "relevance": "Created the HPC Fortran2C++ benchmark dataset (315 pairs) used as one of the two evaluation benchmarks in this paper."
    481     },
    482     {
    483       "title": "LLaMA: Open and efficient foundation language models",
    484       "authors": ["Hugo Touvron", "Thibaut Lavril"],
    485       "year": 2023,
    486       "arxiv_id": "2302.13971",
    487       "relevance": "Foundation model family (LLaMA 3.1 variants) used as both the translation LLM and the synthetic data generator."
    488     }
    489   ],
    490   "engagement_factors": {
    491     "practical_relevance": {
    492       "score": 2,
    493       "justification": "The Fortran→C++ translation use case is practically relevant for HPC code migration, but the alignment training was run on 256 GH200 GPUs and the method targets a niche language pair."
    494     },
    495     "surprise_contrarian": {
    496       "score": 1,
    497       "justification": "Task-specific embedding alignment improving RAG is intuitive rather than surprising; the contribution is in the specific mechanism (S-InfoNCE with CodeBLEU) rather than a counterintuitive finding."
    498     },
    499     "fear_safety": {
    500       "score": 0,
    501       "justification": "No safety, security, or risk implications."
    502     },
    503     "drama_conflict": {
    504       "score": 0,
    505       "justification": "No controversy or conflict with established results."
    506     },
    507     "demo_ability": {
    508       "score": 0,
    509       "justification": "No code, demo, or tool is released."
    510     },
    511     "brand_recognition": {
    512       "score": 1,
    513       "justification": "Los Alamos National Laboratory is well-known in HPC/scientific computing but not a major AI research brand."
    514     }
    515   }
    516 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs