scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28802B)
      1 {
      2   "paper": {
      3     "title": "Enhancing Code Translation in Language Models with Few-Shot Learning via Retrieval-Augmented Generation",
      4     "authors": [
      5       "Manish Bhattarai",
      6       "Javier E. Santos",
      7       "Shawn Jones",
      8       "Ayan Biswas",
      9       "Boian Alexandrov",
     10       "Daniel O'Malley"
     11     ],
     12     "year": 2024,
     13     "venue": "IEEE Conference on High Performance Extreme Computing (HPEC)",
     14     "arxiv_id": "2407.19619",
     15     "doi": "10.1109/HPEC62836.2024.10938485"
     16   },
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "No code repository URL, GitHub link, or archive is provided anywhere in the paper. The RAG pipeline code is not released."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper uses publicly available datasets: Numerical Recipes (ref [20]), HPC Fortran2CPP dataset (ref [5]), and Stack-V2 (ref [21]). These are all referenced with citations to their public sources."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup is provided. The paper mentions using HuggingFace and ChromaDB but does not specify library versions or dependencies."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided. The methods section describes the pipeline conceptually but not with enough detail to reproduce without guessing implementation decisions."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Table I reports mean ± standard deviation for CodeBLEU and sub-metrics across models and datasets (e.g., 'GPT-4o 0.371 ± 0.002')."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper claims RAG 'significantly improves translation quality' and models 'consistently outperformed others' but no statistical significance tests (t-tests, p-values, etc.) are reported. Comparisons are based solely on point estimates."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Table II reports delta CodeBLEU scores between zero-shot and few-shot settings alongside the absolute zero-shot baseline values, giving full context for the magnitude of improvement (e.g., Granite-34B: 0.237 zero-shot baseline with a +0.363 delta in 1-shot, i.e., 0.600)."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No justification for dataset sizes (298 pairs, 315 pairs, 500 sampled). No power analysis or discussion of whether these sizes are sufficient for the claims made."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Table I reports standard deviations alongside mean scores. However, it is ambiguous whether this variance is across multiple runs or across dataset examples."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Zero-shot performance serves as the baseline for all models, with results compared against 1-shot, 2-shot, and 3-shot RAG configurations (Table II)."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Models evaluated include GPT-4o, Llama3-70B Instruct, Codestral, and Mixtral-8x22B, which were contemporary at publication time (2024)."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The study systematically varies the number of shots (0, 1, 2, 3), embedding models (Nomic-Embed, Starencoder, CodeBERT), and retrieval metrics (cosine similarity vs. L2 distance), effectively ablating the RAG pipeline components."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Table I reports CodeBLEU and its four sub-components separately: Ngram Match, Weighted Ngram Match, Syntax Tree Match, and Dataflow Match, providing multiple evaluation perspectives."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No human evaluation of the generated translations is performed. All evaluation is automated using CodeBLEU. No expert review of translation quality or functional correctness."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "The paper does not explicitly describe train/test splits. The RAG retrieval corpus appears to be the same as the evaluation dataset, and no leave-one-out or hold-out protocol is clearly stated."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Results are broken down by model, dataset (HPC Fortran2CPP vs Numerical Recipes), number of shots, embedding model, and retrieval metric in Tables I-II and Figures 5-6."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper discusses StarCoder's poor performance, shows a 'bad RAG setup' experiment in Figure 5c where retrieving the examples with the largest distance (i.e., the least similar ones) degrades performance, and discusses why CodeBERT underperforms as an embedding model."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "StarCoder shows no improvement with few-shot learning (delta = 0.000 across all shots in Table II). GPT models show diminishing returns from additional shots. CodeBERT underperforms as an embedding model."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract claims 'superiority of our approach over traditional zero-shot, particularly in translating between Fortran and C++.' Table II shows positive deltas for most models in few-shot vs zero-shot, supporting this claim."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The paper claims RAG 'improves' and 'enhances' translation quality. The experimental design varies the RAG component (zero-shot vs 1/2/3-shot) while holding other factors constant (same model, same dataset, same prompts), which is adequate for this causal claim."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The title says 'Enhancing Code Translation in Language Models' broadly, but all experiments are limited to Fortran→C++ translation only. The abstract partially qualifies with 'particularly in translating between Fortran and C++' but the title and framing suggest general code translation."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper does not discuss alternative explanations for why RAG helps. For example, it does not consider whether the improvement comes from simply having more context (longer prompts) rather than retrieval quality, or whether random example selection would perform similarly. The bad RAG experiment (Figure 5c) partially addresses retrieval quality but not alternative explanations for the overall effect."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "CodeBLEU measures textual/structural similarity to reference translations, but the paper frames results as measuring 'translation quality' and 'accuracy' without discussing the gap between textual similarity and functional correctness. No execution-based evaluation is performed."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "Models are identified by marketing names only: 'GPT-3.5 turbo', 'GPT-4o', 'Llama3-70B Instruct', etc. No specific version identifiers, snapshot dates, or API versions are provided."
    146       },
    147       "prompts_provided": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Full prompt templates are provided in Figures 3 (zero-shot) and 4 (few-shot), including the system prompt and user prompt structure. The variable parts are the code snippets being translated, which naturally vary per input."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "No generation hyperparameters (temperature, top-p, max tokens, etc.) are reported for any of the models used. These settings significantly affect output quality."
    156       },
    157       "scaffolding_described": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "No agentic scaffolding is used. The system is a single-pass RAG retrieval + generation pipeline without tool use, retry logic, or agent loops."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section III describes preprocessing for each dataset: 'standardized code style, removed comments, handled whitespace and special characters, and mapped Fortran subroutines to their C++ equivalents.' Stack-V2 sampling criteria (file length between 1000-10000 bytes, highest star+fork counts) are also documented."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "There is no dedicated limitations section. The conclusion mentions 'the current limitation in Fortran-C++ pairs challenges fine-tuning LLMs and establishing benchmarks' but this is a single sentence, not substantive discussion."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No threats to validity are discussed. No mention of potential confounds, selection bias in datasets, or limitations of the experimental design."
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The paper does not explicitly state what the results do NOT show. No boundaries on generalization to other language pairs, different code domains, or production settings."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "The source datasets are publicly available, but the generated translations, CodeBLEU scores, and intermediate results are not released for verification."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section III describes data collection in detail: Numerical Recipes comprises 298 Fortran-C++ pairs with documented curation; HPC Fortran2CPP comprises 315 pairs from NPB, PolyBench, and DRB; Stack-V2 sampling criteria are specified (500 examples, 1000-10000 bytes, unique repos, highest star/fork counts)."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No human participants. Data sources are standard public benchmarks and curated datasets."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "The full pipeline is documented in Section III and Figure 1: dataset preparation → embedding generation → retrieval → prompt construction → translation → CodeBLEU evaluation. Preprocessing steps are described for each dataset."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No funding acknowledgments section is present in the paper. All authors are from Los Alamos National Laboratory (a DOE-funded facility) but no specific funding sources or grant numbers are disclosed."
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "All six authors are clearly listed with their Los Alamos National Laboratory affiliations and division names."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "The authors are from a government national laboratory (LANL) and are evaluating third-party models (GPT, Llama, etc.). The presumed funder (DOE/US government) has no financial stake in which LLM performs best for code translation."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests or financial interests statement is present in the paper."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No training data cutoff dates are stated for any of the models used (GPT-4o, Llama3-70B, etc.). This is critical since the Numerical Recipes book has been public since 1988 and HPC benchmarks are widely available online."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No discussion of whether the evaluation datasets (particularly Numerical Recipes code, published since 1988, or public HPC benchmarks) were included in the training data of the LLMs being evaluated."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "The Numerical Recipes dataset is derived from a book published in 1988; NPB, PolyBench, and DRB benchmarks are long-standing public code. Stack-V2 is explicitly from GitHub. All are likely in the training data of models like GPT-4o. No contamination analysis is performed."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study. It is a purely computational benchmark evaluation."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No inference cost, latency, or tokens consumed are reported. The paper uses both commercial API models (GPT-3.5, GPT-4o) and large open models (70B+ parameters) without reporting any cost information."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No compute budget, GPU hours, hardware specifications, or total API spend are mentioned anywhere in the paper."
    293       }
    294     },
    295     "experimental_rigor": {
    296       "seed_sensitivity_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The ± values in Table I appear to represent variance across dataset examples, not across random seeds or multiple runs. No explicit seed sensitivity analysis is reported."
    300       },
    301       "number_of_runs_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The number of experimental runs is never stated. It is unclear whether results represent single runs or averages over multiple runs."
    305       },
    306       "hyperparameter_search_budget": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No hyperparameter search is described. Generation parameters (temperature, etc.) are not even reported, let alone a search budget."
    310       },
    311       "best_config_selection_justified": {
    312         "applies": true,
    313         "answer": true,
    314         "justification": "All configurations are reported (Tables I-II, Figures 5-6) across all models, shots, embeddings, and retrieval metrics. The paper does not cherry-pick a single best configuration."
    315       },
    316       "multiple_comparison_correction": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "Many model/configuration comparisons are made with claims of superiority, but no statistical tests are performed at all, let alone corrections for multiple comparisons."
    320       },
    321       "self_comparison_bias_addressed": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The authors are evaluating their own RAG method against a zero-shot baseline. No discussion of self-evaluation bias or independent replication."
    325       },
    326       "compute_budget_vs_performance": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "RAG adds retrieval overhead (embedding generation, vector search, longer prompts) compared to zero-shot. A 3-shot prompt carries three retrieved examples in addition to the query, so its context is substantially longer than the 0-shot prompt. These compute differences are not discussed or controlled for."
    330       },
    331       "benchmark_construct_validity": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "CodeBLEU measures textual and structural similarity to reference translations, which is not the same as functional correctness. The paper does not discuss whether CodeBLEU actually measures code translation quality, nor compare with execution-based metrics."
    335       },
    336       "scaffold_confound_addressed": {
    337         "applies": true,
    338         "answer": true,
    339         "justification": "The same RAG pipeline and prompt templates are used consistently across all model comparisons. The scaffold (retrieval mechanism, prompt format) is held constant when comparing models."
    340       }
    341     },
    342     "data_leakage": {
    343       "temporal_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of temporal leakage. Numerical Recipes code has been public since 1988; HPC benchmark codes (NPB, PolyBench, DRB) are long-standing public repositories. Models trained on internet data almost certainly saw these codes."
    347       },
    348       "feature_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether the evaluation setup leaks information. In the RAG setting, retrieved examples from the same corpus may provide near-identical solutions as hints."
    352       },
    353       "non_independence_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether train and test examples are independent. The HPC Fortran2CPP dataset shows clear cluster structure (Figure 2a), suggesting code from the same benchmark suite may be highly similar."
    357       },
    358       "leakage_detection_method": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, decontamination, or temporal splits."
    362       }
    363     }
    364   },
    365   "scan_version": 3,
    366   "active_modules": ["experimental_rigor", "data_leakage"],
    367   "claims": [
    368     {
    369       "claim": "RAG-based few-shot learning significantly improves code translation quality over zero-shot for most LLMs.",
    370       "evidence": "Table II shows positive CodeBLEU deltas for 7 of 9 models on HPC Fortran2CPP (e.g., Granite-34B improves from 0.237 to 0.600 with 1-shot). Similar improvements on Numerical Recipes.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Higher RAG similarity scores between retrieved examples and the query correlate with higher CodeBLEU scores.",
    375       "evidence": "Figure 5a-b shows this correlation visually. Figure 5c shows that performance degrades when retrieval selects the examples with the largest distance (i.e., the worst, least similar matches).",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Granite-34B, Llama3-70B, and Mixtral-8x22B are the top performers in few-shot settings.",
    380       "evidence": "Tables I-II and Figure 6 show these models achieving highest CodeBLEU improvements from RAG across both datasets. Granite-34B shows +0.363, Mixtral +0.288 on HPC Fortran2CPP 1-shot.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "GPT-4 Turbo and GPT-3.5 Turbo excel in zero-shot settings but show limited incremental improvement with more shots.",
    385       "evidence": "Table I shows GPT models have highest zero-shot scores (0.371, 0.367). Table II shows their deltas are smaller than open models in relative terms, though still positive.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "CodeBERT underperforms as an embedding model compared to Nomic-Embed and Starencoder due to its 512 token limit.",
    390       "evidence": "Section IV states CodeBERT 'consistently underperformed' and attributes this to its 512 token limit vs 8192 for the other two. No quantitative comparison table is provided for embedding model ablation.",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "StarCoder shows no improvement from few-shot learning.",
    395       "evidence": "Table II shows delta = 0.000 for StarCoder across 1, 2, and 3 shots on both datasets. Authors attribute this to smaller model size and context length.",
    396       "supported": "strong"
    397     }
    398   ],
    399   "methodology_tags": ["benchmark-eval"],
    400   "key_findings": "RAG-based few-shot learning improves Fortran-to-C++ code translation over zero-shot for most LLMs, with Granite-34B showing the largest gains (0.237→0.600 CodeBLEU). Higher retrieval similarity correlates with better translation quality. GPT models lead in zero-shot but gain less from few-shot, while open models like Mixtral-8x22B and Llama3-70B show substantial improvements. CodeBLEU-only evaluation without functional correctness testing limits the strength of conclusions.",
    401   "red_flags": [
    402     {
    403       "flag": "No execution-based evaluation",
    404       "detail": "All evaluation uses CodeBLEU (textual/structural similarity) only. No translated code is compiled or executed to verify functional correctness. High CodeBLEU does not guarantee the translation works."
    405     },
    406     {
    407       "flag": "Severe contamination risk",
    408       "detail": "Numerical Recipes code has been public since 1988. NPB, PolyBench, and DRB benchmarks are long-standing public code widely indexed on GitHub. Models like GPT-4o almost certainly saw these during pre-training, making zero-shot scores unreliable as true baselines."
    409     },
    410     {
    411       "flag": "Unclear train/test separation in RAG",
    412       "detail": "The retrieval corpus appears to be the same as the evaluation dataset. When translating example i, the system may retrieve near-identical examples from the same benchmark suite. The similarity matrix (Figure 2a) shows strong clusters confirming high intra-dataset similarity."
    413     },
    414     {
    415       "flag": "No statistical significance tests",
    416       "detail": "Claims of 'superiority' and models 'outperforming' others are based solely on point comparisons without any statistical testing across the large number of model/configuration comparisons."
    417     },
    418     {
    419       "flag": "Missing generation hyperparameters",
    420       "detail": "Temperature, top-p, max tokens, and other critical generation parameters are not reported for any model. These settings significantly affect output quality and reproducibility."
    421     }
    422   ],
    423   "cited_papers": [
    424     {
    425       "title": "Evaluating Large Language Models Trained on Code",
    426       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    427       "year": 2021,
    428       "arxiv_id": "2107.03374",
    429       "relevance": "Introduced Codex and HumanEval benchmark; foundational work on LLM code generation evaluation."
    430     },
    431     {
    432       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    433       "authors": ["Z. Feng", "D. Guo", "D. Tang"],
    434       "year": 2020,
    435       "arxiv_id": "2002.08155",
    436       "relevance": "Pre-trained model for code used as an embedding model in this study's RAG pipeline."
    437     },
    438     {
    439       "title": "Creating a Dataset for High-Performance Computing Code Translation using LLMs: A Bridge Between OpenMP Fortran and C++",
    440       "authors": ["B. Lei", "C. Ding", "L. Chen"],
    441       "year": 2023,
    442       "relevance": "Created the HPC Fortran2CPP dataset used as a primary evaluation benchmark; demonstrated LLM fine-tuning for code translation."
    443     },
    444     {
    445       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    446       "authors": ["P. Lewis", "E. Perez", "A. Piktus"],
    447       "year": 2020,
    448       "relevance": "Foundational RAG paper that this work builds upon for code translation."
    449     },
    450     {
    451       "title": "StarCoder: may the source be with you!",
    452       "authors": ["R. Li", "L. B. Allal", "Y. Zi"],
    453       "year": 2023,
    454       "relevance": "Open code LLM evaluated in this study; also provides the Starencoder embedding model."
    455     },
    456     {
    457       "title": "Code Llama: Open Foundation Models for Code",
    458       "authors": ["B. Roziere", "J. Gehring", "F. Gloeckle"],
    459       "year": 2023,
    460       "arxiv_id": "2308.12950",
    461       "relevance": "Open code LLM evaluated in this study for code translation capabilities."
    462     },
    463     {
    464       "title": "CodeBLEU: a Method for Automatic Evaluation of Code Synthesis",
    465       "authors": ["S. Ren", "D. Guo", "S. Lu"],
    466       "year": 2020,
    467       "arxiv_id": "2009.10297",
    468       "relevance": "Defines the CodeBLEU metric used as the sole evaluation measure in this study."
    469     },
    470     {
    471       "title": "Lost in Translation: A Study of Bugs Introduced by Large Language Models while Translating Code",
    472       "authors": ["R. Pan", "A. R. Ibrahimzada", "R. Krishna"],
    473       "year": 2023,
    474       "relevance": "Studies LLM-introduced bugs during code translation, providing taxonomy of translation errors and iterative prompting approaches."
    475     },
    476     {
    477       "title": "Granite Code Models: A Family of Open Foundation Models for Code Intelligence",
    478       "authors": ["M. Mishra", "M. Stallone", "G. Zhang"],
    479       "year": 2024,
    480       "arxiv_id": "2405.04324",
    481       "relevance": "Open code LLM evaluated in this study; one of the top performers in few-shot code translation."
    482     },
    483     {
    484       "title": "Mixtral of Experts",
    485       "authors": ["A. Q. Jiang", "A. Sablayrolles", "A. Roux"],
    486       "year": 2024,
    487       "arxiv_id": "2401.04088",
    488       "relevance": "Mixture-of-experts LLM evaluated in this study; among top performers for few-shot code translation."
    489     },
    490     {
    491       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    492       "authors": ["E. J. Hu", "Y. Shen", "P. Wallis"],
    493       "year": 2021,
    494       "relevance": "Parameter-efficient fine-tuning method discussed as an alternative to RAG for adapting LLMs to code translation."
    495     }
    496   ],
    497   "engagement_factors": {
    498     "practical_relevance": {
    499       "score": 2,
    500       "justification": "RAG-augmented few-shot code translation is a technique practitioners could apply to legacy code migration, though no tool is released."
    501     },
    502     "surprise_contrarian": {
    503       "score": 0,
    504       "justification": "Results confirm the expected finding that providing relevant examples (few-shot) improves LLM performance over zero-shot."
    505     },
    506     "fear_safety": {
    507       "score": 0,
    508       "justification": "No safety, security, or AI risk concerns are raised."
    509     },
    510     "drama_conflict": {
    511       "score": 0,
    512       "justification": "No controversy or conflict; straightforward empirical evaluation."
    513     },
    514     "demo_ability": {
    515       "score": 0,
    516       "justification": "No code, demo, or tool is released for others to try."
    517     },
    518     "brand_recognition": {
    519       "score": 1,
    520       "justification": "Authors are from Los Alamos National Laboratory (recognized in HPC). Paper evaluates well-known models (GPT-4o, Llama3) but is not from a major AI lab."
    521     }
    522   }
    523 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs