scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20485B)
      1 {
      2   "paper": {
      3     "title": "The New Compiler Stack: A Survey on the Synergy of LLMs and Compilers",
      4     "authors": ["Shuoming Zhang", "Jiacheng Zhao", "Qiuchu Yu", "Chunwei Xia", "Zheng Wang", "Xiaobing Feng", "Huimin Cui"],
      5     "year": 2026,
      6     "venue": "CCF Transactions on High Performance Computing",
      7     "arxiv_id": "2601.02045",
      8     "doi": "10.1007/s42514-025-00270-x"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["survey_methodology"],
     12   "methodology_tags": ["meta-analysis"],
     13   "key_findings": "This systematic literature review of 159 papers proposes a multi-dimensional taxonomy for LLM-enabled compilation, categorizing work by Design Philosophy (Selector, Translator, Generator), LLM Methodology (Training-Required vs Training-Free), Level of Code Abstraction, and Task Type. The survey identifies three primary advancements: democratization of compiler development, discovery of novel optimization strategies, and broadening of compiler scope. Key challenges identified include correctness/verifiability, scalability, interpretability, and cost-effectiveness, with hybrid compiler systems seen as the most promising path forward.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, code release, or analysis scripts are provided in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The curated corpus of 159 papers is not released as a structured dataset. No supplementary data files are provided."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment or tool specifications are provided for reproducing the survey methodology."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step instructions are provided for reproducing the literature search and selection process."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "Survey paper that does not run experiments or produce statistical results."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "Survey paper with no experiments requiring significance testing."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "Survey paper with no experimental comparisons requiring effect sizes."
     52       },
     53       "sample_size_justified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "Survey paper; no experimental sample size to justify."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "Survey paper with no experiments requiring variance reporting."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The survey does not compare against prior surveys or reviews in a structured way. While it references prior work, it does not systematically compare its coverage, methodology, or findings against existing surveys."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No structured comparison with prior surveys is made, so contemporaneity of baselines cannot be assessed."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "Survey paper with no system components to ablate."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Survey paper with no system to evaluate with metrics."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Survey paper; human evaluation of system outputs is not applicable."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "Survey paper with no test set."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The survey provides detailed per-category breakdowns across multiple dimensions: Design Philosophy (Selector/Translator/Generator), LLM Methodology, Level of Code Abstraction, and Task Type, with Tables 2-4 summarizing papers by task and code level."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 6.2 discusses common challenges including correctness failures, scalability limitations, and interpretability issues. The paper notes that neural compilation 'fails to outperform existing compiler systems' and discusses where LLM approaches break down."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that early neural compilation work achieved 0% accuracy for optimized settings (§5.2), that LLM-based compilation 'is still preliminary' and 'cannot surpass traditional compilers', and that there are more high-level studies than low-level because low-level code is less amenable to LLMs."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims (taxonomy proposal, three primary benefits, correctness/scalability challenges, hybrid systems as promising direction) are all substantiated by the detailed analysis in Sections 3-6."
    116       },
    117       "causal_claims_justified": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "The survey makes descriptive/taxonomic claims about the field rather than causal claims about specific interventions."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 2.3 explicitly defines the scope, distinguishing LLM-based approaches from traditional ML techniques and purely NLP-based SE tasks. The paper is bounded to 'LLM-enabled compilation' with clear inclusion/exclusion criteria."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": false,
    129         "answer": false,
    130         "justification": "As a taxonomy/survey paper with no empirical results of its own, alternative explanations for observed results are not applicable."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": false,
    134         "answer": false,
    135         "justification": "Survey paper with no measurements of its own."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "Survey paper that does not use any models."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "Survey paper that does not use prompting."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "Survey paper with no experimental setup requiring hyperparameters."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "Survey paper with no agentic scaffolding."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 2.2 describes the three-phase literature search and selection protocol with counts at each stage (311 → 246 → 159), along with explicit inclusion/exclusion criteria for filtering."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations section. The paper discusses challenges of the field (§6.2) but does not discuss limitations of its own survey methodology."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to the validity of the survey itself are discussed. The paper does not address potential biases in its literature search, selection criteria, or taxonomy design."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 2.3 explicitly defines scope boundaries, stating what is excluded: traditional ML techniques, purely NLP-based SE tasks, and studies not involving Transformer-based LLMs for compilation-related tasks."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The list of 159 papers is not provided as a downloadable dataset. Tables 2-4 list many of the papers, but the full corpus is not available for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 2.2 describes the data collection: searches across arXiv, Google Scholar, ACM DL, IEEE Xplore using specified keywords, plus snowballing from seminal papers identified by domain experts."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants; data source is published literature, which is a standard corpus."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Figure 1 and Section 2.2 document the full pipeline: Phase 1 (search + initial filtering → 311), Phase 2 (dedup + category assignment → 246), Phase 3 (full-text review with inclusion/exclusion criteria → 159)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Declarations section states funding: 'partially supported by National R&D Program of China (2024YFB4505603), Jiangsu Province Key R&D Program (Grant No. BG2024028) and National Natural Science Foundation of China (U23B2020, 62302479, 62232015).'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly stated: SKLP/ICT/CAS, UCAS, and University of Leeds."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Funding is from Chinese national science foundations and R&D programs, which have no commercial stake in the survey's findings about LLM-compiler research."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Declarations section explicitly states: 'the corresponding author states that there is no Conflict of interest.'"
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this survey."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this survey."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this survey."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this survey."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this survey."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this survey."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this survey."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "Survey paper; no method with inference costs."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "Survey paper; no computational experiments."
    290       }
    291     },
    292     "survey_methodology": {
    293       "prisma_or_structured_protocol": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Section 2.2 follows a structured SLR protocol citing Kitchenham et al. (2007), with a three-phase search and selection process, explicit inclusion/exclusion criteria, and a flow diagram (Figure 1) showing paper counts at each stage."
    297       },
    298       "quality_assessment_of_sources": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The survey does not assess the methodological quality of its 159 source papers. All papers are treated equally regardless of rigor. Section 5.2 notes 'direct comparisons are not always perfectly fair' but does not systematically assess quality."
    302       },
    303       "publication_bias_discussed": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No discussion of publication bias. The survey does not consider whether its sources skew toward positive results or whether negative results in LLM-compiler research are underrepresented."
    307       }
    308     }
    309   },
    310   "claims": [
    311     {
    312       "claim": "159 primary studies were identified through a systematic three-phase literature search protocol.",
    313       "evidence": "Section 2.2 and Figure 1 describe the protocol: 311 candidates after Phase 1, 246 after dedup in Phase 2, 159 after full-text review in Phase 3.",
    314       "supported": "moderate"
    315     },
    316     {
    317       "claim": "LLMs are integrated into compilation through three design philosophies: Selector, Translator, and Generator.",
    318       "evidence": "Sections 3.1-3.3 describe each philosophy with multiple examples from the literature. Selector: LLM chooses from predefined options. Translator: LLM directly rewrites code. Generator: LLM generates transformation scripts.",
    319       "supported": "strong"
    320     },
    321     {
    322       "claim": "There are more studies on high-level code transformations (59+34) than low-level IR/assembly (14+16).",
    323       "evidence": "Tables 2-4 enumerate the studies. The paper counts are derived from these tables and discussed in the text after Section 4.2.3.",
    324       "supported": "strong"
    325     },
    326     {
    327       "claim": "Neural compilation accuracy has progressed from 32.8% to over 99% on ExeBench.",
    328       "evidence": "Section 5.2 and Figure 6(b) track this progression: transformer-x86 (32.8%), llm-x86 (91.72%), LEGO-Compiler (99.0%).",
    329       "supported": "moderate"
    330     },
    331     {
    332       "claim": "The field lacks standardized comparison protocols, hindering fair direct evaluation.",
    333       "evidence": "Section 5.2 notes 'direct comparisons are not always perfectly fair' due to non-uniform setups. Section 6.2.3 discusses the need for standardized benchmarks. The GPU kernel comparison caveat notes 'different works may evaluate on different GPU hardware.'",
    334       "supported": "strong"
    335     }
    336   ],
    337   "red_flags": [
    338     {
    339       "flag": "No quality assessment of source papers",
    340       "detail": "The survey treats all 159 papers equally without assessing their methodological quality. This means results from rigorous peer-reviewed studies are given the same weight as preliminary arXiv preprints, potentially laundering weak results into the survey's synthesis."
    341     },
    342     {
    343       "flag": "No publication bias discussion",
    344       "detail": "The survey does not consider whether its corpus skews toward positive results. LLM-compiler papers with negative findings (approaches that failed) are likely underrepresented in the literature."
    345     },
    346     {
    347       "flag": "No limitations of own methodology",
    348       "detail": "The paper discusses challenges in the field but does not discuss limitations of its own survey approach — e.g., potential selection bias from keyword choices, coverage gaps, or subjectivity in taxonomy assignment."
    349     },
    350     {
    351       "flag": "Uncritical reporting of SOTA evolution",
    352       "detail": "Section 5.2 tracks SOTA progress across tasks but acknowledges that 'direct comparisons are not always perfectly fair' without systematically addressing what this means for the claimed progress narrative. The progression figures may be misleading due to different evaluation setups."
    353     }
    354   ],
    355   "cited_papers": [
    356     {
    357       "title": "Evaluating Large Language Models Trained on Code",
    358       "authors": ["Mark Chen", "Jerry Tworek"],
    359       "year": 2021,
    360       "arxiv_id": "2107.03374",
    361       "relevance": "Introduced HumanEval benchmark and Codex, foundational to LLM code generation evaluation."
    362     },
    363     {
    364       "title": "Large Language Models for Compiler Optimization",
    365       "authors": ["Chris Cummins", "Volker Seeker"],
    366       "year": 2023,
    367       "arxiv_id": "2309.07062",
    368       "relevance": "Pioneered LLM pre-training on LLVM IR for compiler optimization, directly relevant to AI-assisted code optimization."
    369     },
    370     {
    371       "title": "Automated program repair in the era of large pre-trained language models",
    372       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    373       "year": 2023,
    374       "relevance": "First extensive study on applying LLMs directly for automated program repair."
    375     },
    376     {
    377       "title": "WhiteFox: White-box compiler fuzzing empowered by large language models",
    378       "authors": ["Chenyuan Yang"],
    379       "year": 2024,
    380       "relevance": "Multi-agent framework for LLM-based compiler fuzzing, relevant to agentic AI testing."
    381     },
    382     {
    383       "title": "KernelBench: Can LLMs write efficient GPU kernels?",
    384       "authors": ["Anne Ouyang"],
    385       "year": 2025,
    386       "relevance": "Key benchmark for evaluating LLM capabilities in GPU kernel generation and optimization."
    387     },
    388     {
    389       "title": "CUDA-L1: Improving CUDA optimization via contrastive reinforcement learning",
    390       "authors": ["Xiang Li"],
    391       "year": 2025,
    392       "relevance": "Demonstrates RL-based approach to CUDA optimization achieving near-perfect correctness, relevant to AI code optimization."
    393     },
    394     {
    395       "title": "Copiloting the copilots: Fusing large language models with completion engines for automated program repair",
    396       "authors": ["Yuxiang Wei", "Chunqiu Steven Xia", "Lingming Zhang"],
    397       "year": 2023,
    398       "relevance": "LLM-based automated program repair combining completion engines, relevant to AI-assisted debugging."
    399     },
    400     {
    401       "title": "AlphaTrans: A neuro-symbolic compositional approach for repository-level code translation and validation",
    402       "authors": ["Ali Reza Ibrahimzada"],
    403       "year": 2025,
    404       "relevance": "Repository-level LLM-based code transpilation with verification, relevant to scalable AI code transformation."
    405     },
    406     {
    407       "title": "LEGO-Compiler: Enhancing Neural Compilation Through Translation Composability",
    408       "authors": ["Shuoming Zhang", "Jiacheng Zhao"],
    409       "year": 2025,
    410       "arxiv_id": "2505.20356",
    411       "relevance": "Achieved 99% neural compilation accuracy through divide-and-conquer, demonstrating LLM capability for end-to-end compilation."
    412     },
    413     {
    414       "title": "CoTran: An LLM-based code translator using reinforcement learning with feedback from compiler and symbolic execution",
    415       "authors": ["Prithwish Jana"],
    416       "year": 2024,
    417       "relevance": "RL-based feedback mechanism for LLM code translation, relevant to AI-assisted code migration."
    418     }
    419   ]
    420 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs