scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (31663B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Enhancing Cross-Language Code Translation via Task-Specific Embedding Alignment in Retrieval-Augmented Generation",
      6     "authors": [
      7       "Manish Bhattarai",
      8       "Minh Vu",
      9       "Javier E. Santos",
     10       "Ismael Boureima",
     11       "Daniel O'Malley"
     12     ],
     13     "year": 2025,
     14     "venue": "4th International Workshop on Knowledge-Augmented Methods for NLP (KnowledgeNLP'25)",
     15     "arxiv_id": null,
     16     "doi": "10.18653/v1/2025.knowledgenlp-1.8"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims of CodeBLEU improvement from 0.64→0.73 (HPC) and 0.52→0.60 (Numerical Recipes), representing 14% and 15% relative improvement, are directly supported by Figure 2 and Table 1.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The causal claim that embedding alignment improves translation quality is supported by controlled comparison — aligned vs. unaligned embeddings are tested with the same models, datasets, and RAG pipeline, isolating the single variable of embedding alignment.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The abstract and paper consistently specify 'from Fortran to C++'. The conclusion explicitly notes 'Future work could extend this alignment strategy to additional programming languages,' acknowledging the scope limitation.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The Limitations section (Section 6) discusses multiple alternative explanations: CodeBLEU may not reflect functional equivalence, contrastive learning may misalign syntactically different but functionally similar code, and noisy training data may degrade alignment.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly acknowledges in Section 4 that 'CodeBLEU may not capture all functional nuances' and performed manual checks. The Limitations section extensively discusses how CodeBLEU (the proxy) may not translate to functional correctness (the desired outcome).",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 6 is a dedicated 'Limitations' section with substantial discussion spanning multiple paragraphs covering four distinct limitation areas.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The Limitations section discusses specific threats: CodeBLEU not capturing functional equivalence, contrastive learning treating syntactically different but functionally similar code as negatives, granularity issues with continuous similarity scores, and noise in training data affecting alignment quality.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly states scope is Fortran→C++ translation. The conclusion notes 'Future work could extend this alignment strategy to additional programming languages.' Limitations state CodeBLEU doesn't capture functional behavior.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Section 7 states: 'This research was funded by the LANL ASC grant AI4Coding and the LANL Institutional Computing Program, supported by the U.S. DOE NNSA under Contract No. 89233218CNA000001.'",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors are affiliated with Los Alamos National Laboratory, clearly listed under the title. They evaluate open-source models (LLaMA, Mistral, Mixtral), not their own products.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Funding is from DOE/NNSA for general AI4Coding research. The funder does not have a commercial stake in the specific outcome of aligned vs. unaligned embedding performance.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is present in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms (RAG, CodeBLEU, contrastive learning, embedding alignment, S-InfoNCE) defined in-paper or cited to prior work with precise usage.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Paper explicitly states twofold contribution: (1) demonstrate contrastive learning for task-specific RAG retrieval, (2) show optimization without LLM fine-tuning improves translation quality.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 engages with historical approaches (rule-based, SMT, CodeBERT, Codex, prior RAG work) and positions novel contribution of task-specific metric alignment.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No code repository URL is provided anywhere in the paper. The aligned embedding model and pipeline code are not released.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The evaluation datasets (HPC Fortran2C++ and Numerical Recipes) and the source dataset (Stack-V2) are publicly available. However, the 25K curated Fortran snippets and generated C++ translations used for alignment are not released.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper mentions hardware (256 GH200 GPUs) and model names but provides no software environment specifications, dependency lists, or version details for libraries used.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions, README, or scripts are provided. A researcher would need to reverse-engineer the pipeline from the method description.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "Standard deviations are reported in Figure 2 (e.g., 'Avg Aligned: 0.73 ± 0.17') and box plots in Figure 3 show distributional spread including quartiles.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests (p-values, t-tests, etc.) are reported despite claims that aligned embeddings 'significantly' improve performance over unaligned. Comparisons are based solely on mean differences.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Relative improvements are reported (14% and 15%) with baseline context (e.g., 'from 0.64 to 0.73'). Table 1 shows delta values across all configurations.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No justification for the evaluation dataset sizes (315 and 298 pairs) or the 25,000 training snippets. No power analysis or discussion of whether these sizes are sufficient for the claims made.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Standard deviations are reported in Figure 2 across shot configurations, and Figure 3 provides box plots showing distributional spread of CodeBLEU scores.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "The paper compares aligned embeddings against unaligned (generic) embeddings using the same RAG framework, and includes zero-shot (no RAG) as an additional baseline in Table 1.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines include StarEncoder (2023), and the comparison setup follows Bhattarai et al. (2024). Models tested include LLaMA 3.1 (2024) and Mixtral (2024).",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The comparison of aligned vs. unaligned embeddings effectively ablates the alignment component. Table 1 shows zero-shot (no retrieval) vs. few-shot (with retrieval) across shot counts, isolating the contribution of both RAG and alignment.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": false,
    200           "justification": "Only CodeBLEU is used as the evaluation metric. Although CodeBLEU is itself a composite of four sub-components, no additional independent metrics (e.g., compilation rate, functional correctness, BLEU, CrystalBLEU) are reported.",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "The paper mentions 'a small-scale manual check on a subset of translations' (Section 4) but does not report this systematically — no sample size, no inter-rater agreement, no quantitative results. The appendix shows a single qualitative example.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "The embedding model is trained on Stack-V2 data, while evaluation is conducted on separate datasets: HPC Fortran2C++ (315 pairs) and Numerical Recipes (298 pairs).",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Table 1 provides breakdowns by model (4 models), dataset (2 datasets), and shot count (0-3 shots). Figure 2 scatter plots show per-sample variation.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "No failure cases are discussed. While scatter plots in Figure 2 show some samples where aligned embeddings perform worse than unaligned, these cases are not analyzed or explained.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": false,
    230           "justification": "Every experiment shows improvement from alignment. The observation of 'diminishing marginal gains' beyond 2 shots is the closest to a negative finding, but no configurations that hurt performance or approaches that were tried and abandoned are reported.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Models are specified with family, size, and variant: 'LLaMA 3.1-8B', 'LLaMA 3.1-70B', 'Mistral123B', 'Mixtral 8x22B' (all instruct-tuned), and 'StarCoder model with 125M parameters' for embeddings.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "The actual prompts used for code translation are not provided. The appendix shows example input/output but not the prompt template or system instructions given to the LLMs.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "Embedding training hyperparameters are reported (temperature τ=0.1, Adam optimizer, lr=10⁻³, batch size 128). However, LLM inference parameters (temperature, top-p, max tokens) are not stated, which significantly affect generation output.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used. The system is a standard RAG pipeline (embed → retrieve → generate) without retry logic, tool use, or multi-step reasoning.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 4 describes sampling criteria ('files larger than 500 bytes', 'highest combined star and fork counts'), extraction using LLaMA 3.1-70B Instruct to isolate executable Fortran code, and translation pipeline for generating C++ pairs.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "While evaluation datasets are public, the aligned embedding model, the 25K generated C++ translations, pairwise CodeBLEU matrices, and per-sample evaluation scores are not released for independent verification.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 4 describes sampling 25,000 Fortran snippets from Stack-V2 (>500 bytes, highest star/fork counts), using LLaMA 3.1-70B Instruct for Fortran extraction, and generating C++ translations with LLaMA 3.1-8B.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. Data sources are standard public datasets (Stack-V2, HPC Fortran2C++, Numerical Recipes).",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": false,
    288           "justification": "The pipeline stages are described at a high level (sample → extract → translate → compute CodeBLEU → train embeddings), but intermediate counts are missing. How many of the 500K+ Stack-V2 files survived the 500-byte and quality filtering before reaching 25K is not stated.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoff dates are stated for any of the evaluated models (LLaMA 3.1, Mistral, Mixtral). These models could have seen the evaluation benchmarks during training.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of whether HPC Fortran2C++ (2023) or Numerical Recipes code pairs appeared in the training data of LLaMA 3.1, Mistral, or Mixtral models.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "The Numerical Recipes dataset is from 1988 and its code has been widely available online for decades. HPC Fortran2C++ was published in 2023. Neither benchmark's potential presence in model training data is discussed.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference cost, latency, or per-translation cost is reported. The paper describes training time (15 min/epoch on 256 GPUs) but not the cost of using the aligned RAG system at inference time.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Section 4 states training was 'distributed across 256 GH200 GPUs' taking 'approximately 15 minutes per epoch, with early stopping at epoch 20.' Scaling estimates for 64 and 32 GPUs are also provided.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "No mention of random seeds, seed sensitivity analysis, or results across multiple seeds.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The number of experimental runs is not explicitly stated. It is unclear whether results represent single runs or averages over multiple runs.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "While specific hyperparameter values are given (τ=0.1, lr=10⁻³, batch=128), no search budget, search method, or number of configurations tried is reported.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": true,
    392           "justification": "Results are reported for all configurations tested (4 models × 2 datasets × 4 shot counts × 2 alignment conditions), not just the best. Table 1 presents the full matrix.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": false,
    397           "answer": false,
    398           "justification": "No statistical tests are performed at all, so correction for multiple comparisons is not applicable. The absence of statistical testing is captured by the significance_tests item.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "No acknowledgment of potential bias from evaluating their own alignment method. They implement both the aligned and unaligned pipelines without discussing whether this introduces bias.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "The alignment requires training on 256 GH200 GPUs for 20 epochs, but performance gains are not discussed relative to this substantial compute cost. No comparison at matched compute budgets.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": true,
    416           "justification": "The paper explicitly discusses CodeBLEU's construct validity: Section 4 notes it 'may not capture all functional nuances' and the Limitations section extensively discusses the gap between CodeBLEU scores and functional equivalence.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": true,
    421           "answer": true,
    422           "justification": "The RAG scaffold is held constant across all comparisons — only the embedding model (aligned vs. unaligned) varies. The same retrieval pipeline, same k values, and same LLMs are used for both conditions.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "No discussion of temporal leakage. The Numerical Recipes code has been publicly available since 1988, and HPC Fortran2C++ since 2023. LLaMA 3.1 and other models may have seen these in training.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "No discussion of whether the RAG retrieval examples leak information that would not be available in a real deployment scenario.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "No discussion of whether Stack-V2 (used for alignment training) shares code with the HPC Fortran2C++ or Numerical Recipes evaluation sets, which could inflate alignment effectiveness.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No concrete leakage detection or prevention methods are applied (no canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines).",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "Task-specific embedding alignment via CodeBLEU-guided contrastive learning improves RAG-based code translation over generic embeddings",
    457       "evidence": "Figure 2: HPC Fortran2C++ aligned=0.73±0.17 vs unaligned=0.64±0.19; Numerical Recipes aligned=0.60±0.19 vs unaligned=0.52±0.19; consistent across 4 models and 3 shot counts",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "The method achieves 14-15% relative improvement in CodeBLEU without fine-tuning the language model",
    462       "evidence": "Abstract and Section 4: HPC 0.64→0.73 (+14%), Numerical Recipes 0.52→0.60 (+15%); paper explicitly states 'gains realized without any fine-tuning'",
    463       "supported": "strong"
    464     },
    465     {
    466       "claim": "Larger models (70B) consistently outperform smaller models (8B) across datasets and alignment conditions",
    467       "evidence": "Table 1 and text: 'LLaMA3.1 70B model consistently achieves higher CodeBLEU scores'; example HPC 1-shot: 70B aligned=0.710 vs 8B aligned=0.688",
    468       "supported": "strong"
    469     },
    470     {
    471       "claim": "Soft-InfoNCE loss aligns embeddings to match CodeBLEU similarity distribution, pulling high-similarity pairs closer in embedding space",
    472       "evidence": "Lemma 1 proves stationary condition p*_ij = S_ij/Σ(S_ik); experiments show improved retrieval quality and translation; theoretical soundness but empirical ablation lacking",
    473       "supported": "moderate"
    474     },
    475     {
    476       "claim": "Few-shot prompting (1-2 examples) achieves most performance gains; marginal returns beyond 2-shot",
    477       "evidence": "Table 1 shows largest deltas at 0→1-shot and 1→2-shot; smaller gains 2→3-shot; text states 'diminishing marginal gains beyond two'",
    478       "supported": "moderate"
    479     },
    480     {
    481       "claim": "Retrieved examples with aligned embeddings are more semantically and syntactically meaningful for downstream translation",
    482       "evidence": "Intuitive argument in Section 3.2; Lemma 1 characterization; experiments show CodeBLEU improvement but direct retrieval quality not independently validated",
    483       "supported": "weak"
    484     },
    485     {
    486       "claim": "Task-specific alignment improves few-shot learning capacity of LLMs without modifying the model itself",
    487       "evidence": "Table 1 shows larger improvement gains for aligned vs unaligned in few-shot settings (aligned: +0.346 at 1-shot vs unaligned: +0.262); methodology only modifies retrieval, not LLM",
    488       "supported": "strong"
    489     }
    490   ],
    491   "methodology_tags": [
    492     "benchmark-eval"
    493   ],
    494   "key_findings": "Task-specific embedding alignment via contrastive learning (S-InfoNCE loss) on CodeBLEU scores improves Fortran-to-C++ code translation in RAG frameworks by 14-15% (relative CodeBLEU) without LLM fine-tuning, showing consistent gains across two benchmarks and four models (LLaMA 3.1-8B, LLaMA 3.1-70B, Mistral 123B, Mixtral 8x22B). The method exhibits diminishing returns beyond 2-shot prompting and is robust across different model architectures (LLaMA, Mistral, Mixtral). However, CodeBLEU captures only syntactic and semantic similarity, not functional correctness, and the evaluation is limited by small test sets and minimal human validation.",
    495   "red_flags": [
    496     {
    497       "flag": "No statistical significance testing",
    498       "detail": "Improvements are reported without p-values, paired t-tests, or significance thresholds. Standard deviations are reported only for the absolute scores; no confidence intervals are given for the improvement deltas."
    499     },
    500     {
    501       "flag": "Limited and unsystematic human evaluation",
    502       "detail": "Appendix A shows only 5 example translations with informal observation ('majority compiled and produced expected outputs'); no human raters, evaluation rubric, or inter-rater agreement metrics."
    503     },
    504     {
    505       "flag": "Test data contamination not addressed",
    506       "detail": "LLaMA training cutoff dates not provided. Numerical Recipes (1988) and HPC Fortran2C++ (Lei et al. 2023) benchmarks likely in training data of models used, potentially inflating results."
    507     },
    508     {
    509       "flag": "Proxy metric used for both training and evaluation",
    510       "detail": "CodeBLEU guides contrastive learning AND serves as sole automatic evaluation metric; no independent measure of functional correctness or compilation success reported at scale."
    511     },
    512     {
    513       "flag": "Prompts not disclosed",
    514       "detail": "Prompts used to generate C++ from Fortran are not provided; reproduction requires inferring prompt structure, limiting verifiability and replicability."
    515     },
    516     {
    517       "flag": "Code and trained models not released",
    518       "detail": "No repository, pre-trained alignment model, or implementation code mentioned; replicating the 256-GPU training (5+ hours) is infeasible for most researchers."
    519     },
    520     {
    521       "flag": "Small test set size for code generation",
    522       "detail": "Evaluation on only 315 and 298 code pairs; no analysis of how results generalize beyond these specific small benchmarks."
    523     },
    524     {
    525       "flag": "No failure case analysis",
    526       "detail": "All results show improvement in main experiments; paper does not analyze cases where alignment fails, retrieves poor examples, or hurts translation quality."
    527     },
    528     {
    529       "flag": "Incomplete theoretical validation",
    530       "detail": "Lemma 1 characterizes convergence of S-InfoNCE but lacks empirical ablation on loss function variants, temperature τ sensitivity, or direct validation that convergence improves the downstream task."
    531     }
    532   ],
    533   "cited_papers": [
    534     {
    535       "title": "Evaluating Large Language Models Trained on Code (Codex)",
    536       "relevance": "Foundational code LLM evaluation; establishes CodeBERT and Codex as key models for the code generation task"
    537     },
    538     {
    539       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    540       "relevance": "Introduces RAG framework central to this work; establishes retrieval+generation paradigm for knowledge-intensive tasks"
    541     },
    542     {
    543       "title": "CodeBERT: A Pre-trained Model for Programming and Natural Languages",
    544       "relevance": "Code embedding model; alternative to StarEncoder used in this work; relevant to prior generic embedding approaches"
    545     },
    546     {
    547       "title": "StarCoder: May the Source Be with You!",
    548       "relevance": "Source of StarEncoder, the code embedding model selected as the alignment backbone in this work; key technical component"
    549     },
    550     {
    551       "title": "StarCoder 2 and the Stack V2: The Next Generation",
    552       "relevance": "Stack-V2 dataset (500K+ Fortran snippets) provides the primary alignment corpus in this work"
    553     },
    554     {
    555       "title": "Creating a Dataset for High-Performance Computing Code Translation Using LLMs",
    556       "relevance": "Introduces HPC Fortran2C++ benchmark; one of two main evaluation datasets; establishes code translation as benchmark task"
    557     },
    558     {
    559       "title": "CodeBLEU: A Method for Automatic Evaluation of Code Synthesis",
    560       "relevance": "Core evaluation metric; guides contrastive learning via CodeBLEU similarity scores; captures syntactic (AST, n-gram) and semantic (dataflow) code similarity"
    561     },
    562     {
    563       "title": "Representation Learning with Contrastive Predictive Coding",
    564       "relevance": "Theoretical foundation for InfoNCE loss; basis for proposed S-InfoNCE soft-label adaptation"
    565     }
    566   ],
    567   "engagement_factors": {
    568     "practical_relevance": {
    569       "score": 2,
    570       "justification": "The Fortran→C++ translation use case is practically relevant for HPC code migration, but requires 256 GPUs for alignment training and targets a niche language pair."
    571     },
    572     "surprise_contrarian": {
    573       "score": 1,
    574       "justification": "Task-specific embedding alignment improving RAG is intuitive rather than surprising; the contribution is in the specific mechanism (S-InfoNCE with CodeBLEU) rather than a counterintuitive finding."
    575     },
    576     "fear_safety": {
    577       "score": 0,
    578       "justification": "No safety, security, or risk implications."
    579     },
    580     "drama_conflict": {
    581       "score": 0,
    582       "justification": "No controversy or conflict with established results."
    583     },
    584     "demo_ability": {
    585       "score": 0,
    586       "justification": "No code, demo, or tool is released."
    587     },
    588     "brand_recognition": {
    589       "score": 1,
    590       "justification": "Los Alamos National Laboratory is well-known in HPC/scientific computing but not a major AI research brand."
    591     }
    592   },
    593   "hn_data": {
    594     "threads": [],
    595     "top_points": 0,
    596     "total_points": 0,
    597     "total_comments": 0
    598   }
    599 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs