scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (33744B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Enhancing Code Translation in Language Models with Few-Shot Learning via Retrieval-Augmented Generation",
      6     "authors": [
      7       "Manish Bhattarai",
      8       "Javier E. Santos",
      9       "Shawn Jones",
     10       "Ayan Biswas",
     11       "Boian Alexandrov"
     12     ],
     13     "year": 2024,
     14     "venue": "IEEE Conference on High Performance Extreme Computing",
     15     "arxiv_id": "2407.19619",
     16     "doi": "10.1109/HPEC62836.2024.10938485"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims 'superiority of our approach over traditional zero-shot, particularly in translating between Fortran and C++.' Table II shows positive deltas for most models in few-shot vs zero-shot, supporting this claim.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper claims RAG 'improves' and 'enhances' translation quality. The experimental design varies the RAG component (zero-shot vs 1/2/3-shot) while holding other factors constant (same model, same dataset, same prompts), which is adequate for this causal claim.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title says 'Enhancing Code Translation in Language Models' broadly, but all experiments are limited to Fortran→C++ translation only. The abstract partially qualifies with 'particularly in translating between Fortran and C++' but the title and framing suggest general code translation.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss alternative explanations for why RAG helps. For example, it does not consider whether the improvement comes from simply having more context (longer prompts) rather than retrieval quality, or whether random example selection would perform similarly. The bad RAG experiment (Figure 5c) partially addresses retrieval quality but not alternative explanations for the overall effect.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "CodeBLEU measures textual/structural similarity to reference translations, but the paper frames results as measuring 'translation quality' and 'accuracy' without discussing the gap between textual similarity and functional correctness. No execution-based evaluation is performed.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations section. The conclusion mentions 'the current limitation in Fortran-C++ pairs challenges fine-tuning LLMs and establishing benchmarks' but this is a single sentence, not substantive discussion.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats to validity are discussed. No mention of potential confounds, selection bias in datasets, or limitations of the experimental design.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what the results do NOT show. No boundaries on generalization to other language pairs, different code domains, or production settings.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgments section is present in the paper. All authors are from Los Alamos National Laboratory (a DOE-funded facility) but no specific funding sources or grant numbers are disclosed.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All six authors are clearly listed with their Los Alamos National Laboratory affiliations and division names.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The authors are from a government national laboratory (LANL) and are evaluating third-party models (GPT, Llama, etc.). The presumed funder (DOE/US government) has no financial stake in which LLM performs best for code translation.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is present in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "RAG defined: 'Retrieval-Augmented Generation... maintains a repository of code translation examples and dynamically retrieves the most relevant examples'. Few-shot explained: 'the number of examples provided to the model during inference'. Code translation is clear from context (Fortran→C++).",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section I clearly states: 'we propose a RAG framework that enhances Few-Shot Learning for code translation tasks... providing the model with multiple contextual examples'. Contribution is a method (RAG pipeline) + empirical evaluation across multiple LLMs and embedding models.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section II provides 17 references with engagement: contrasts with Lei et al.'s fine-tuning approach ('improved 9 fold' but computationally expensive), cites 'Lost in Translation' taxonomy of LLM translation bugs, acknowledges RAG as more efficient than fine-tuning. Shows how this work differs (dynamic adaptation without retraining).",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No code repository URL, GitHub link, or archive is provided anywhere in the paper. The RAG pipeline code is not released.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The paper uses publicly available datasets: Numerical Recipes (ref [20]), HPC Fortran2CPP dataset (ref [5]), and Stack-V2 (ref [21]). These are all referenced with citations to their public sources.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup is provided. The paper mentions using HuggingFace and ChromaDB but does not specify library versions or dependencies.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided. The methods section describes the pipeline conceptually but not with enough detail to reproduce without guessing implementation decisions.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "Table I reports mean ± standard deviation for CodeBLEU and sub-metrics across models and datasets (e.g., 'GPT-4o 0.371 ± 0.002').",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper claims RAG 'significantly improves translation quality' and models 'consistently outperformed others' but no statistical significance tests (t-tests, p-values, etc.) are reported. Comparisons are based solely on point estimates.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Table II reports delta CodeBLEU scores between zero-shot and few-shot settings with absolute baseline values, giving full context for the magnitude of improvement (e.g., Granite-34B from 0.237 zero-shot to +0.363 in 1-shot).",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No justification for dataset sizes (298 pairs, 315 pairs, 500 sampled). No power analysis or discussion of whether these sizes are sufficient for the claims made.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Table I reports standard deviations alongside mean scores. However, it is ambiguous whether this variance is across multiple runs or across dataset examples.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Zero-shot performance serves as the baseline for all models, with results compared against 1-shot, 2-shot, and 3-shot RAG configurations (Table II).",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Models evaluated include GPT-4o, Llama3-70B Instruct, Codestral, and Mixtral-8x22B, which were contemporary at publication time (2024).",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The study systematically varies number of shots (0, 1, 2, 3), embedding models (Nomic-Embed, Starencoder, CodeBERT), and retrieval metrics (cosine similarity vs l2 distance), effectively ablating the RAG pipeline components.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Table I reports CodeBLEU and its four sub-components separately: Ngram Match, Weighted Ngram Match, Syntax Tree Match, and Dataflow Match, providing multiple evaluation perspectives.",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "No human evaluation of the generated translations is performed. All evaluation is automated using CodeBLEU. No expert review of translation quality or functional correctness.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": false,
    212           "justification": "The paper does not explicitly describe train/test splits. The RAG retrieval corpus appears to be the same as the evaluation dataset, and no leave-one-out or hold-out protocol is clearly stated.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by model, dataset (HPC Fortran2CPP vs Numerical Recipes), number of shots, embedding model, and retrieval metric in Tables I-II and Figures 5-6.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "The paper discusses StarCoder's poor performance, shows a 'bad RAG setup' experiment in Figure 5c where using the largest distance metric as retrieval degrades performance, and discusses why CodeBERT underperforms as an embedding model.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "StarCoder shows no improvement with few-shot learning (delta = 0.000 across all shots in Table II). GPT models show diminishing returns from additional shots. CodeBERT underperforms as an embedding model.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Models are identified by marketing names only: 'GPT-3.5 turbo', 'GPT-4o', 'Llama3-70B Instruct', etc. No specific version identifiers, snapshot dates, or API versions are provided.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Full prompt templates are provided in Figures 3 (zero-shot) and 4 (few-shot), including the system prompt and user prompt structure. The variable parts are the code snippets being translated, which naturally vary per input.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No generation hyperparameters (temperature, top-p, max tokens, etc.) are reported for any of the models used. These settings significantly affect output quality.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used. The system is a single-pass RAG retrieval + generation pipeline without tool use, retry logic, or agent loops.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section III describes preprocessing for each dataset: 'standardized code style, removed comments, handled whitespace and special characters, and mapped Fortran subroutines to their C++ equivalents.' Stack-V2 sampling criteria (file length between 1000-10000 bytes, highest star+fork counts) are also documented.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The source datasets are publicly available, but the generated translations, CodeBLEU scores, and intermediate results are not released for verification.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section III describes data collection in detail: Numerical Recipes comprises 298 Fortran-C++ pairs with documented curation; HPC Fortran2CPP comprises 315 pairs from NPB, PolyBench, and DRB; Stack-V2 sampling criteria are specified (500 examples, 1000-10000 bytes, unique repos, highest star/fork counts).",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. Data sources are standard public benchmarks and curated datasets.",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The full pipeline is documented in Section III and Figure 1: dataset preparation → embedding generation → retrieval → prompt construction → translation → CodeBLEU evaluation. Preprocessing steps are described for each dataset.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoff dates are stated for any of the models used (GPT-4o, Llama3-70B, etc.). This is critical since the Numerical Recipes book has been public since 1988 and HPC benchmarks are widely available online.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of whether the evaluation datasets (particularly Numerical Recipes code, published since 1988, or public HPC benchmarks) were included in the training data of the LLMs being evaluated.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "The Numerical Recipes dataset is derived from a book published in 1988; NPB, PolyBench, and DRB benchmarks are long-standing public code. Stack-V2 is explicitly from GitHub. All are likely in the training data of models like GPT-4o. No contamination analysis is performed.",
    309           "source": "opus"
    310         }
    311       },
    312       "cost_and_practicality": {
    313         "inference_cost_reported": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "No inference cost, latency, or tokens consumed are reported. The paper uses both commercial API models (GPT-3.5, GPT-4o) and large open models (70B+ parameters) without reporting any cost information.",
    317           "source": "opus"
    318         },
    319         "compute_budget_stated": {
    320           "applies": true,
    321           "answer": false,
    322           "justification": "No compute budget, GPU hours, hardware specifications, or total API spend are mentioned anywhere in the paper.",
    323           "source": "opus"
    324         }
    325       },
    326       "human_studies": {
    327         "pre_registered": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants in this study. It is a purely computational benchmark evaluation.",
    331           "source": "opus"
    332         },
    333         "irb_or_ethics_approval": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants in this study.",
    337           "source": "opus"
    338         },
    339         "demographics_reported": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants in this study.",
    343           "source": "opus"
    344         },
    345         "inclusion_exclusion_criteria": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants in this study.",
    349           "source": "opus"
    350         },
    351         "randomization_described": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants in this study.",
    355           "source": "opus"
    356         },
    357         "blinding_described": {
    358           "applies": false,
    359           "answer": false,
    360           "justification": "No human participants in this study.",
    361           "source": "opus"
    362         },
    363         "attrition_reported": {
    364           "applies": false,
    365           "answer": false,
    366           "justification": "No human participants in this study.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "The ± values in Table I appear to represent variance across dataset examples, not across random seeds or multiple runs. No explicit seed sensitivity analysis is reported.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The number of experimental runs is never stated. It is unclear whether results represent single runs or averages over multiple runs.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "No hyperparameter search is described. Generation parameters (temperature, etc.) are not even reported, let alone a search budget.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": true,
    392           "justification": "All configurations are reported (Tables I-II, Figures 5-6) across all models, shots, embeddings, and retrieval metrics. The paper does not cherry-pick a single best configuration.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "Many model/configuration comparisons are made with claims of superiority, but no statistical tests are performed at all, let alone corrections for multiple comparisons.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The authors are evaluating their own RAG method against a zero-shot baseline. No discussion of self-evaluation bias or independent replication.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "RAG adds retrieval overhead (embedding generation, vector search, longer prompts) compared to zero-shot. Using 3-shot adds three retrieved example pairs to every prompt, making it substantially longer than the 0-shot prompt. These compute differences are not discussed or controlled for.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "CodeBLEU measures textual and structural similarity to reference translations, which is not the same as functional correctness. The paper does not discuss whether CodeBLEU actually measures code translation quality, nor compare with execution-based metrics.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": true,
    421           "answer": true,
    422           "justification": "The same RAG pipeline and prompt templates are used consistently across all model comparisons. The scaffold (retrieval mechanism, prompt format) is held constant when comparing models.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "No discussion of temporal leakage. Numerical Recipes code has been public since 1988; HPC benchmark codes (NPB, PolyBench, DRB) are long-standing public repositories. Models trained on internet data almost certainly saw these codes.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "No discussion of whether the evaluation setup leaks information. In the RAG setting, retrieved examples from the same corpus may provide near-identical solutions as hints.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "No discussion of whether train and test examples are independent. The HPC Fortran2CPP dataset shows clear cluster structure (Figure 2a), suggesting code from the same benchmark suite may be highly similar.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, decontamination, or temporal splits.",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "RAG with few-shot learning significantly improves code translation quality compared to zero-shot",
    457       "evidence": "Table II shows improvements for most models: Granite-34B +0.363 (1-shot), Mixtral +0.288 (1-shot), Llama3 +0.117 (1-shot). However, StarCoder shows +0.000 improvement.",
    458       "supported": "moderate"
    459     },
    460     {
    461       "claim": "Few-shot improvements progress monotonically with number of shots (1-shot < 2-shot < 3-shot)",
    462       "evidence": "Table II shows inconsistent pattern: some models improve 1→2→3 (e.g., Granite 0.6→0.540→0.54), others plateau or decline (GPT-3.5 3-shot 0.188 vs 2-shot 0.176). Claim is partially supported.",
    463       "supported": "moderate"
    464     },
    465     {
    466       "claim": "Relevance of retrieved examples matters more than seeing additional examples",
    467       "evidence": "Figure 5c shows bad RAG (largest distance retrieval) performs worse than good RAG (cosine similarity) for same number of shots. However, no random-example baseline to isolate relevance effect.",
    468       "supported": "moderate"
    469     },
    470     {
    471       "claim": "Code-specific models (Granite, CodeLlama, Llama3) outperform general models in few-shot RAG",
    472       "evidence": "Table II: Granite-34B, Llama3, Mixtral top performers in few-shot. Table I: Phi-3 (3.8B, general) underperforms CodeLlama, Granite in zero-shot. Supported.",
    473       "supported": "strong"
    474     },
    475     {
    476       "claim": "Nomic-Embed and Starencoder are equivalent; CodeBERT underperforms",
    477       "evidence": "Section IV.1: 'Nomic-embed and Starencoder exhibited equivalent performance'. CodeBERT's underperformance is attributed to its 512-token limit vs 8192 for the others. Supported.",
    478       "supported": "strong"
    479     },
    480     {
    481       "claim": "GPT models plateau in few-shot learning while open models continue improving",
    482       "evidence": "Table II: GPT-3.5 improvement +0.157 (1-shot) to +0.188 (3-shot) vs Granite +0.363→+0.278→+0.302. Section IV.4 attributes this to GPT prioritizing 'executable code over strict alignment with ground truth'. Supported.",
    483       "supported": "strong"
    484     }
    485   ],
    486   "methodology_tags": [
    487     "benchmark-eval"
    488   ],
    489   "key_findings": "Retrieval-Augmented Generation combined with few-shot learning improves code translation accuracy (measured by CodeBLEU) for most LLMs, with improvements ranging from +0.06 to +0.37 CodeBLEU points depending on model and embedding choice. Granite-34B and Mixtral benefit most from RAG and few-shot learning (+0.363 and +0.288 in 1-shot, Table II) and Llama3 more modestly (+0.117), while Phi-3 and StarCoder show minimal gains (StarCoder +0.000 across all shots). The relevance of retrieved examples matters (bad retrieval setup hurts performance), but the work does not isolate whether improvements stem from example relevance vs. simply seeing more examples.",
    490   "red_flags": [
    491     {
    492       "flag": "Data leakage risk",
    493       "detail": "RAG retrieves examples from the Numerical Recipes and HPC Fortran2CPP datasets, which are also the datasets being evaluated. No explicit train/test split or cross-validation is described to prevent overlap, so retrieved examples could come from the same set being tested."
    494     },
    495     {
    496       "flag": "No human evaluation",
    497       "detail": "Only CodeBLEU automatic metric used. No validation that CodeBLEU (n-gram + syntax + dataflow) correlates with human judgments of translation correctness, readability, or practicality. CodeBLEU could be gamed."
    498     },
    499     {
    500       "flag": "Benchmark contamination not addressed",
    501       "detail": "Stack-V2 is from GitHub (likely in LLM training data). HPC benchmarks are public (likely in training). No discussion of whether results are inflated due to model memorization of benchmarks."
    502     },
    503     {
    504       "flag": "Training cutoff not stated",
    505       "detail": "Model training data cutoff dates not documented. Cannot assess whether Fortran-C++ examples or benchmarks entered training data. Implicit cutoffs inferred from release dates (unreliable)."
    506     },
    507     {
    508       "flag": "No reproduction path",
    509       "detail": "Code not released, data not released, hyperparameters incomplete (temperature, max_tokens missing), environment not specified. Claims cannot be verified independently."
    510     },
    511     {
    512       "flag": "Limited generalization evidence",
    513       "detail": "All RAG evaluation is on Fortran-C++ pairs only. Stack-V2 zero-shot results (Figure 7) are not paired with RAG few-shot results. The paper's claims about RAG are framed as applying to 'code translation' broadly, but the evidence is narrow."
    514     },
    515     {
    516       "flag": "No statistical significance testing",
    517       "detail": "Improvements reported without p-values or significance tests. Claims of 'superiority' and 'significant improvements' are unsupported by hypothesis testing. Improvements could be noise."
    518     },
    519     {
    520       "flag": "Incomplete ablation",
    521       "detail": "Figure 5c compares good RAG (cosine) vs bad RAG (largest distance), but no comparison with random few-shot examples to isolate whether improvement is from relevance vs. quantity."
    522     },
    523     {
    524       "flag": "Scope-title mismatch",
    525       "detail": "Title claims 'Enhancing Code Translation in Language Models' (suggests general translation), but all evidence is Fortran-C++ specific. Generalization claims exceed evidence."
    526     }
    527   ],
    528   "cited_papers": [
    529     {
    530       "title": "Evaluating Large Language Models Trained on Code",
    531       "relevance": "Foundational work on code LLMs (Codex, HumanEval); establishes LLM capability benchmarks for code."
    532     },
    533     {
    534       "title": "Creating a Dataset for High-Performance Computing Code Translation using LLMs: A Bridge Between OpenMP Fortran and C++",
    535       "relevance": "Directly related; introduces HPC Fortran-C++ dataset used in this paper; establishes baseline for code translation benchmarking."
    536     },
    537     {
    538       "title": "Lost in Translation: A Study of Bugs Introduced by Large Language Models while Translating Code",
    539       "relevance": "Taxonomy of LLM translation errors; identifies bug categories that CodeBLEU metric may miss."
    540     },
    541     {
    542       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    543       "relevance": "Foundational RAG paper; establishes retrieval-augmented generation framework adapted here for code translation."
    544     },
    545     {
    546       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    547       "relevance": "Fine-tuning alternative to RAG; this paper argues RAG is more efficient than LoRA for code translation."
    548     },
    549     {
    550       "title": "Code Llama: Open Foundation Models for Code",
    551       "relevance": "Code-specific LLM evaluated in experiments; represents state-of-art open model for code tasks."
    552     },
    553     {
    554       "title": "Granite Code Models: A Family of Open Foundation Models for Code Intelligence",
    555       "relevance": "IBM code model family evaluated; top performer in few-shot RAG settings in this study."
    556     },
    557     {
    558       "title": "Nomic Embed: Training a Reproducible Long Context Text Embedder",
    559       "relevance": "Embedding model used for RAG retrieval; outperforms CodeBERT for code due to 8192-token context vs 512."
    560     }
    561   ],
    562   "engagement_factors": {
    563     "practical_relevance": {
    564       "score": 2,
    565       "justification": "RAG-augmented few-shot code translation is a technique practitioners could apply to legacy code migration, though no tool is released."
    566     },
    567     "surprise_contrarian": {
    568       "score": 0,
    569       "justification": "Results confirm the expected finding that providing relevant examples (few-shot) improves LLM performance over zero-shot."
    570     },
    571     "fear_safety": {
    572       "score": 0,
    573       "justification": "No safety, security, or AI risk concerns are raised."
    574     },
    575     "drama_conflict": {
    576       "score": 0,
    577       "justification": "No controversy or conflict; straightforward empirical evaluation."
    578     },
    579     "demo_ability": {
    580       "score": 0,
    581       "justification": "No code, demo, or tool is released for others to try."
    582     },
    583     "brand_recognition": {
    584       "score": 1,
    585       "justification": "Authors are from Los Alamos National Laboratory (recognized in HPC). Paper evaluates well-known models (GPT-4o, Llama3) but is not from a major AI lab."
    586     }
    587   },
    588   "hn_data": {
    589     "threads": [
    590       {
    591         "hn_id": "39575314",
    592         "title": "An observational study of programming and cannabis intoxication",
    593         "points": 57,
    594         "comments": 101,
    595         "url": "https://news.ycombinator.com/item?id=39575314"
    596       },
    597       {
    598         "hn_id": "40533295",
    599         "title": "Easy Problems That LLMs Get Wrong",
    600         "points": 5,
    601         "comments": 2,
    602         "url": "https://news.ycombinator.com/item?id=40533295"
    603       },
    604       {
    605         "hn_id": "40147402",
    606         "title": "OpenELM: An Efficient Language Model Family by Apple",
    607         "points": 4,
    608         "comments": 0,
    609         "url": "https://news.ycombinator.com/item?id=40147402"
    610       },
    611       {
    612         "hn_id": "40141376",
    613         "title": "OpenELM: An Efficient Language Model Family with Open-Source Training, Inference",
    614         "points": 3,
    615         "comments": 0,
    616         "url": "https://news.ycombinator.com/item?id=40141376"
    617       },
    618       {
    619         "hn_id": "44719165",
    620         "title": "Ultracoarse Equilibria and Ordinal-Folding Dynamics, Infinite Multi-Agent Games",
    621         "points": 2,
    622         "comments": 1,
    623         "url": "https://news.ycombinator.com/item?id=44719165"
    624       },
    625       {
    626         "hn_id": "42185270",
    627         "title": "Generative AI Usage and Exam Performance [pdf]",
    628         "points": 1,
    629         "comments": 0,
    630         "url": "https://news.ycombinator.com/item?id=42185270"
    631       },
    632       {
    633         "hn_id": "40145156",
    634         "title": "OpenELM: Efficient Language Model Family with Open-Source Training and Inference",
    635         "points": 1,
    636         "comments": 0,
    637         "url": "https://news.ycombinator.com/item?id=40145156"
    638       }
    639     ],
    640     "top_points": 57,
    641     "total_points": 73,
    642     "total_comments": 104
    643   }
    644 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs