scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24698B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Enhancing Code Translation in Language Models with Few-Shot Learning via Retrieval-Augmented Generation",
      6     "authors": [
      7       "Manish Bhattarai",
      8       "Javier E. Santos",
      9       "Shawn Jones",
     10       "Ayan Biswas",
     11       "Boian Alexandrov"
     12     ],
     13     "year": 2024,
     14     "venue": "IEEE Conference on High Performance Extreme Computing",
     15     "arxiv_id": "2407.19619",
     16     "doi": "10.1109/HPEC62836.2024.10938485"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims ('significantly improves translation quality', 'superior approach') are supported by Tables I–II showing CodeBLEU improvements across models.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Zero-shot vs. few-shot RAG comparison supports causal claims. Figure 5c ablation (bad RAG setup) demonstrates retrieval mechanism impact.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Results bounded to Fortran→C++ translation on three specific datasets. Title is broad but content is appropriately scoped.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Paper explains model performance variance (GPT plateau vs. code-specific models) but does not explore alternative explanations for WHY RAG works or when it fails.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "CodeBLEU metric is explicitly designed to measure code translation quality with four components (N-gram, weighted N-gram, syntax, dataflow); distinction between measurement and claim is clear.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Section V is 'Conclusion and Future Work' with minimal limitations discussion. One sentence mentions 'current limitation in Fortran-C++ pairs' but no dedicated threats-to-validity section.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No specific threats identified. Mentions dataset scarcity but not other threats like generalization to other language pairs, overfitting to translation patterns, or validation design limitations.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Results are specific to Fortran-C++ but scope boundaries are implicit, not explicitly stated. No discussion of what results do NOT show.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding source disclosed despite all authors being at Los Alamos National Laboratory, a federally funded institution.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors list Los Alamos National Laboratory affiliation with specific divisions.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding disclosed; cannot assess independence.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or declaration of financial interests provided.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "RAG, CodeBLEU, few-shot learning, and embedding models are all defined with mathematical formulations (Section III) and metric explanations.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Contribution is explicit: RAG framework for code translation with evaluation across multiple LLM models, embedding models, and shot counts.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section II reviews code translation history, fine-tuning approaches, and shows how RAG differs (more flexible, dynamic adaptation without retraining).",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No code repository or release mentioned. Paper describes methodology but provides no reproducible implementation.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "HPC Fortran2CPP availability unclear; Numerical Recipes is public but custom preprocessing applied; Stack-V2 is public but custom 500-example subset not explicitly released.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements.txt, Dockerfile, or explicit dependency/version specifications. Mentions Hugging Face and ChromaDB but not precise versions.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Pipeline steps described (Fig. 1) and prompt templates shown (Figs. 3–4) but no step-by-step reproduction instructions or hyperparameter details for replication.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "Table I reports means with standard deviations (±) for zero-shot CodeBLEU across models and metrics.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests (t-tests, ANOVA) or p-values reported despite comparative claims.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Table II reports absolute CodeBLEU improvements (e.g., Granite-34B: +0.363 one-shot) with baseline context.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Sample sizes (298, 315, 500 examples) provided but not justified. No power analysis or rationale for choosing these sizes.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Table I shows mean ± std dev; individual data points visible in scatter plots (Fig. 5). Variance comprehensively reported for zero-shot, less so for few-shot.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Zero-shot vs. few-shot comparison across models, embedding types, and shot numbers (0–3).",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Uses 2024-contemporary models: GPT-4o, Llama3-70B, CodeLlama-34B, Granite-34B, Mixtral-8x22B.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "Figure 5c shows RAG with bad retrieval (largest distance), but no systematic ablation of embedding models, shot counts, or dataset components.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "CodeBLEU decomposed into four components (N-gram, Weighted N-gram, Syntax Tree, Dataflow); retrieval metrics (cosine, L2) compared.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "No human evaluation of code quality. CodeBLEU is automatic; no usability or correctness assessment by domain experts.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": false,
    212           "justification": "No explicit mention of test/train split or held-out validation. Unclear if evaluation is on training data or separate test set.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Breakdowns provided by model, dataset, and shot count. Missing: complexity-based, bug-type, or language-feature breakdowns.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "No examples of failed translations, incorrect outputs, or worst-case scenarios shown or analyzed.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "StarCoder shows 0.000 improvement (no gain, i.e., a negative result). CodeBERT underperformance noted. Some negative results visible but not prominently discussed.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Model names given (GPT-4o, Llama3-70B) but OpenAI versions not dated; Hugging Face models require explicit snapshot lookup not provided.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Figures 3 and 4 explicitly show zero-shot and few-shot prompt templates used in experiments.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "Shot counts (1, 2, 3) and retrieval metrics (cosine, L2) specified. Missing: temperature, top-p, top-k, max tokens, and embedding model hyperparameters.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Figure 1 pipeline clearly shows embedding generation → retrieval → LLM inference steps. RAG mechanism described mathematically and visually.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Preprocessing steps documented: code style standardization, comment removal, whitespace handling for Numerical Recipes; file length filtering (1000–10K bytes) for Stack-V2.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Fortran and C++ code snippets not released. Datasets cited but custom subsets and preprocessing outputs not publicly available.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Numerical Recipes: manual curation with style standardization. HPC: derived from Lei et al. (2023). Stack-V2: GitHub sampling with length/quality filters.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; N/A.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Figure 1 shows full pipeline: preprocessing → embedding → retrieval → few-shot prompt construction. Steps documented in text.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoff dates provided for GPT or open models. Critical for Fortran-C++ evaluation risk assessment.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of train/test overlap. Stack-V2 (from GitHub) likely in training data of recent LLMs; no decontamination attempted.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "No discussion of whether benchmark examples existed before model training. Risk unaddressed, especially for GitHub-derived datasets.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants; N/A.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants; N/A.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants; N/A.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants; N/A.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants; N/A.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants; N/A.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants; N/A.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No latency, memory, or API cost reported. Relevant for practitioners adopting RAG for code translation.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No total compute budget, GPU hours, or cost for running experiments mentioned.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "RAG-based few-shot learning significantly improves code translation quality over zero-shot",
    375       "evidence": "Table II: Granite-34B improves from 0.237 (zero-shot) to 0.600 (one-shot) on HPC dataset; mean improvement +0.363 CodeBLEU",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Code-specialized LLMs outperform general-purpose models for Fortran-to-C++ translation",
    380       "evidence": "Table I: CodeLlama-34B (0.243), Granite-34B (0.237) consistently outperform Phi-3 (0.228) in zero-shot; specialized training data is causal factor",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Similarity of retrieved examples directly correlates with translation quality",
    385       "evidence": "Figure 5 scatter plots show positive correlation between RAG similarity score (color) and CodeBLEU outcome. Figure 5c (bad retrieval) confirms causality",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Nomic-Embed and Starencoder are superior embedding models for code retrieval compared to CodeBERT",
    390       "evidence": "Section IV: 'CodeBERT consistently underperformed...likely due to 512-token limit vs. 8192 for others'. CodeLlama-34B with Nomic: 0.243→0.321 (two-shot); CodeBERT showed no comparable gains",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "More shots (up to 3) improve translation quality; benefits plateau or slightly decline at 3 shots",
    395       "evidence": "Table II: one-shot to three-shot gains continue (e.g., Codestral: +0.074 → +0.158 on HPC), but some models show decline (Granite: +0.363 → +0.302 from 1-shot to 3-shot)",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "HPC Fortran2CPP dataset yields higher CodeBLEU scores than Numerical Recipes due to less code complexity",
    400       "evidence": "Section IV: 'HPC dataset contains more standardized and less complex code'; Granite-34B achieves 0.6 on HPC vs. 0.49±0.20 on Numerical Recipes (one-shot CodeBERT)",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "empirical"
    407   ],
    408   "key_findings": "RAG-enhanced few-shot prompting significantly improves Fortran-to-C++ translation across multiple LLM models, with CodeBLEU improvements up to +0.367 (Mixtral-8x22B on Numerical Recipes, three-shot). Code-specialized models (Llama3-70B, Granite-34B, Mixtral-8x22B) outperform general models and show stronger gains from few-shot RAG. The similarity of retrieved examples directly correlates with translation quality, validating dynamic in-context learning without retraining—a more flexible alternative to fine-tuning.",
    409   "red_flags": [
    410     {
    411       "flag": "No statistical significance testing",
    412       "detail": "Improvements reported as absolute CodeBLEU deltas without p-values, confidence intervals around the point estimates, or significance tests. Cannot determine whether improvements are real or noise."
    413     },
    414     {
    415       "flag": "No human evaluation",
    416       "detail": "CodeBLEU is an automatic metric; no domain expert assessment of translation correctness, maintainability, or runtime behavior. Metric may not correlate with actual code quality."
    417     },
    418     {
    419       "flag": "Code and data not released",
    420       "detail": "No repository, GitHub link, or dataset release. Reproducibility impossible; claims cannot be independently verified."
    421     },
    422     {
    423       "flag": "Training data contamination not discussed",
    424       "detail": "Stack-V2 is sourced from GitHub (likely in the training data of the models evaluated). The HPC Fortran2CPP dataset from Lei et al. (2023) may also predate the models' training cutoffs. Risk unaddressed."
    425     },
    426     {
    427       "flag": "Limited ablation studies",
    428       "detail": "Only Figure 5c shows bad RAG setup. No ablation of embedding components, dataset features, or prompt design. Cannot isolate which design choices matter most."
    429     },
    430     {
    431       "flag": "No failure case analysis",
    432       "detail": "No examples of incorrect translations, syntax errors, semantic faults, or worst-case scenarios. Unknown when RAG helps vs. hurts."
    433     },
    434     {
    435       "flag": "Sample sizes not justified",
    436       "detail": "Datasets of 298–500 examples; no power analysis or justification. May be too small for stable conclusions across language pairs."
    437     },
    438     {
    439       "flag": "Model versions underspecified",
    440       "detail": "GPT-4o and GPT-3.5 versions not dated; open models on Hugging Face require explicit snapshot IDs for reproducibility, not provided."
    441     }
    442   ],
    443   "cited_papers": [
    444     {
    445       "title": "Evaluating Large Language Models Trained on Code (Codex)",
    446       "relevance": "Foundational LLM for code generation; comparison baseline for code translation capability"
    447     },
    448     {
    449       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    450       "relevance": "Code embedding model used for retrieval in RAG pipeline; evaluated for performance comparison"
    451     },
    452     {
    453       "title": "Large Language Models are Zero-Shot Reasoners",
    454       "relevance": "Zero-shot prompting technique; baseline approach compared against few-shot RAG"
    455     },
    456     {
    457       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    458       "relevance": "RAG framework foundation; core methodology adapted for code translation"
    459     },
    460     {
    461       "title": "Lost in Translation: A Study of Bugs Introduced by Large Language Models while Translating Code",
    462       "relevance": "Code translation pitfalls and bug taxonomy; motivation for improving LLM translation quality"
    463     },
    464     {
    465       "title": "Creating a Dataset for High-Performance Computing Code Translation using LLMs",
    466       "relevance": "Source of HPC Fortran-C++ dataset used in experiments; prior work on LLM code translation"
    467     },
    468     {
    469       "title": "Code Llama: Open Foundation Models for Code",
    470       "relevance": "Code-specialized model evaluated; demonstrates code-specific pretraining benefit"
    471     },
    472     {
    473       "title": "StarCoder: may the source be with you!",
    474       "relevance": "Code generation model and embedding model (Starencoder) evaluated for translation and retrieval"
    475     }
    476   ],
    477   "engagement_factors": {
    478     "practical_relevance": {
    479       "score": 2,
    480       "justification": "RAG framework is practical for Fortran-C++ legacy modernization, but limited to one language pair and code/data not released."
    481     },
    482     "surprise_contrarian": {
    483       "score": 1,
    484       "justification": "Few-shot learning benefits are well-established; RAG application to code is incremental—no surprising findings or contradictions to conventional wisdom."
    485     },
    486     "fear_safety": {
    487       "score": 0,
    488       "justification": "No safety, alignment, or security concerns raised. Translation task is inherently safe."
    489     },
    490     "drama_conflict": {
    491       "score": 0,
    492       "justification": "No controversy, debate, or conflict angle. Technical benchmarking paper with no social/ethical dimension."
    493     },
    494     "demo_ability": {
    495       "score": 1,
    496       "justification": "RAG pipeline requires code, embeddings, and vector database setup—all non-trivial. No released implementation limits hands-on exploration."
    497     },
    498     "brand_recognition": {
    499       "score": 1,
    500       "justification": "Los Alamos National Laboratory is a recognized institution, but authors are not prominent figures in LLM/code research."
    501     }
    502   },
    503   "hn_data": {
    504     "threads": [
    505       {
    506         "hn_id": "39575314",
    507         "title": "An observational study of programming and cannabis intoxication",
    508         "points": 57,
    509         "comments": 101,
    510         "url": "https://news.ycombinator.com/item?id=39575314"
    511       },
    512       {
    513         "hn_id": "40533295",
    514         "title": "Easy Problems That LLMs Get Wrong",
    515         "points": 5,
    516         "comments": 2,
    517         "url": "https://news.ycombinator.com/item?id=40533295"
    518       },
    519       {
    520         "hn_id": "40147402",
    521         "title": "OpenELM: An Efficient Language Model Family by Apple",
    522         "points": 4,
    523         "comments": 0,
    524         "url": "https://news.ycombinator.com/item?id=40147402"
    525       },
    526       {
    527         "hn_id": "40141376",
    528         "title": "OpenELM: An Efficient Language Model Family with Open-Source Training, Inference",
    529         "points": 3,
    530         "comments": 0,
    531         "url": "https://news.ycombinator.com/item?id=40141376"
    532       },
    533       {
    534         "hn_id": "44719165",
    535         "title": "Ultracoarse Equilibria and Ordinal-Folding Dynamics, Infinite Multi-Agent Games",
    536         "points": 2,
    537         "comments": 1,
    538         "url": "https://news.ycombinator.com/item?id=44719165"
    539       },
    540       {
    541         "hn_id": "42185270",
    542         "title": "Generative AI Usage and Exam Performance [pdf]",
    543         "points": 1,
    544         "comments": 0,
    545         "url": "https://news.ycombinator.com/item?id=42185270"
    546       },
    547       {
    548         "hn_id": "40145156",
    549         "title": "OpenELM: Efficient Language Model Family with Open-Source Training and Inference",
    550         "points": 1,
    551         "comments": 0,
    552         "url": "https://news.ycombinator.com/item?id=40145156"
    553       }
    554     ],
    555     "top_points": 57,
    556     "total_points": 73,
    557     "total_comments": 104
    558   }
    559 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs