scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28941B)
      1 {
      2   "paper": {
      3     "title": "KernelBand: Steering LLM-based Kernel Optimization via Hardware-Aware Multi-Armed Bandits",
      4     "authors": [
      5       "Dezhi Ran",
      6       "Shuxiao Xie",
      7       "Mingfang Ji",
      8       "Anmin Liu",
      9       "Mengzhou Wu",
     10       "Yuan Cao",
     11       "Yuzhe Guo",
     12       "Hao Yu",
     13       "Linyi Li",
     14       "Yitao Hu",
     15       "Wei Yang",
     16       "Tao Xie"
     17     ],
     18     "year": 2026,
     19     "venue": "arXiv",
     20     "arxiv_id": "2511.18868"
     21   },
     22   "scan_version": 2,
     23   "active_modules": ["experimental_rigor", "data_leakage"],
     24   "methodology_tags": ["benchmark-eval", "theoretical"],
     25   "key_findings": "KernelBand formulates LLM-based GPU kernel optimization as a multi-armed bandit problem, combining hardware-aware pruning with trace-driven clustering to guide exploration. Evaluated on TritonBench-G across 3 GPU architectures and 4 LLMs, it achieves up to 1.91× geometric mean speedup and 39–140% higher Fast@1 success rates over state-of-the-art baselines. Ablation studies show the structured bandit policy is essential — replacing it with LLM semantic reasoning regresses performance to 0.97×, i.e., slower than the reference kernel. The framework also delivers 35–50% higher speedup per dollar than unguided approaches.",
     26   "checklist": {
     27     "artifacts": {
     28       "code_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No repository URL, code archive, or link to released source code is provided anywhere in the paper."
     32       },
     33       "data_released": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The evaluation uses TritonBench-G (Li et al., 2025b), a publicly available benchmark. The corrected version from GEAK is also referenced as open-source."
     37       },
     38       "environment_specified": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Section 4.1 states 'CUDA 12.1 with Triton 3.3.0' and lists GPU hardware, and Section 3.6 mentions scikit-learn KMeans. However, no requirements.txt, Dockerfile, or comprehensive dependency specification is provided."
     42       },
     43       "reproduction_instructions": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No step-by-step reproduction instructions, README with commands, or scripts for replicating experiments are mentioned."
     47       }
     48     },
     49     "statistical_methodology": {
     50       "confidence_intervals_or_error_bars": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "All results in Tables 1, 2, and 4 are reported as point estimates (e.g., '1.91×', '79.8%') with no confidence intervals, error bars, or ± notation."
     54       },
     55       "significance_tests": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "Claims of superiority (e.g., 'outperforming GEAK by 42.5% in speedup') are based solely on comparing point estimates. No p-values, t-tests, or any statistical significance tests are reported."
     59       },
     60       "effect_sizes_reported": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Effect sizes are consistently reported with baseline context: '42.5% in speedup' (Section 4.2), '1.91× vs 1.34×' (Table 1), '35-50% higher speedup per dollar' (Section 4.4.1). Speedup ratios and relative improvements are provided throughout."
     64       },
     65       "sample_size_justified": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The full benchmark has 183 kernels and the subset has 50 kernels. While the subset sampling strategy is described (stratified, seed=42, Appendix E), there is no justification for why these sample sizes are sufficient for the claims, and no power analysis."
     69       },
     70       "variance_reported": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No standard deviations, variance across seeds, or spread measures are reported. Appendix H mentions 'median execution time', but that concerns kernel timing only; no variance across independent experimental runs of the optimization pipeline is reported."
     74       }
     75     },
     76     "evaluation_design": {
     77       "baselines_included": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Two baselines are compared: GEAK (Wang et al., 2025a), an open-source Triton kernel optimization agent, and Best-of-N (BoN). PyTorch baselines (eager, inductor, max-autotune) are also compared in Appendix G."
     81       },
     82       "baselines_contemporary": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "GEAK (2025) is contemporary. Other agent-based methods (STARK, CudaForge, TritonForge) are discussed but their code is unavailable (Appendix F). The paper acknowledges this and explains why only GEAK could be compared."
     86       },
     87       "ablation_study": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Section 4.5 and Table 4 present comprehensive ablations: single-component (w/o Clustering, w/o Profiling, LLM Strategy Selection) and framework-level (w/o Strategy + Raw Prof., w/o Strategy Set, BoN). Appendix J provides detailed descriptions."
     91       },
     92       "multiple_metrics": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Three complementary metrics are used: Correct (%), Fast@1 (%), and Geometric Mean Speedup (in both standard and fallback modes), as defined in Section 4.1."
     96       },
     97       "human_evaluation": {
     98         "applies": false,
     99         "answer": false,
    100         "justification": "Human evaluation is irrelevant — the claims are about automated GPU kernel performance (speedup, correctness), which are objectively measured via execution time and functional equivalence checks."
    101       },
    102       "held_out_test_set": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "Hyperparameters (K=3, θsat=75%, c=2.0) appear to be tuned on the same benchmark used for reporting results. Section 4.3.1 tests K ∈ {1,2,3,5} on the 50-kernel subset and selects K=3, then reports results on the same subset. No separate validation/test split."
    106       },
    107       "per_category_breakdown": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Table 1 provides breakdowns by difficulty level (L1-2, L3, L4-5). Appendix E (Table 7) shows category distribution. Table 10 provides per-strategy breakdowns across hardware platforms."
    111       },
    112       "failure_cases_discussed": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper reports that KernelBand fails on roughly 20-22% of tasks (correctness 77.8-79.8%) and discusses where baselines fail, but does not analyze specific failure modes of KernelBand itself — no error analysis or qualitative examples of kernels where the method breaks down."
    116       },
    117       "negative_results_reported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Table 4 shows the LLM Strategy Selection ablation drops to 0.97× (slower than the reference kernel). K=5 underperforms at 1.54× vs K=3's 1.66×. Raw profiling injection degrades correctness to 43.9%. These are explicitly discussed as negative findings."
    121       }
    122     },
    123     "claims_and_evidence": {
    124       "abstract_claims_supported": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Abstract claims of 'up to 1.91× geometric mean speedup' (Table 1, A100), 'over 33% average improvement' (derived from Table 1 comparisons), and '39-140%' Fast@1 improvement (Table 1 across GPUs) are all supported by the experimental results."
    128       },
    129       "causal_claims_justified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Causal claims are made through ablation studies (Table 4): 'replacing our bandit policy with LLM semantic reasoning regresses performance to 0.97×'. The ablation design uses controlled single-variable manipulation, which is adequate for these causal claims."
    133       },
    134       "generalization_bounded": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The abstract bounds claims to 'TritonBench-G with three GPU architectures and four code LLMs.' The title ('LLM-based Kernel Optimization') matches the scope. Claims are consistently stated relative to the tested configurations."
    138       },
    139       "alternative_explanations_discussed": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No substantive discussion of alternative explanations for the results. The paper does not consider whether improvements might be due to higher compute usage, strategy set being hand-tuned for this benchmark, or other confounds beyond the ablation variables."
    143       },
    144       "proxy_outcome_distinction": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The paper measures kernel execution speedup and claims kernel optimization improvement. The measurement (execution time ratios, correctness rates) directly matches the claims. No proxy gap exists."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Table 2 lists 'DeepSeek-V3.2', 'GPT-5', 'Claude Opus 4.5', 'Gemini 3 Flash' — these are marketing names without snapshot dates or API version identifiers. No model checkpoint IDs or API version strings are provided."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "Table 6 describes optimization strategies at a high level (e.g., 'Partition computation into configurable tile sizes'). The actual prompt text sent to LLMs is not provided — neither in the paper nor via a repository link."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Table 5 reports Temperature=1.0, Max Output Tokens=16384. Section 3.6 reports K=3, τ=10, θsat=75%, c=2.0. Sensitivity analysis for K is in Section 4.3.1."
    165       },
    166       "scaffolding_described": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The entire KERNELBAND framework is described in detail: Algorithm 1 (workflow), runtime clustering (Section 3.3), hardware-constrained bandit policy (Section 3.4), UCB selection (Eq. 6), and implementation details (Section 3.6)."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Appendix F describes benchmark modifications: adopting GEAK's corrected version, AMD-to-NVIDIA function substitutions, and excluding sin_computation. Appendix E documents the stratified sampling for the 50-kernel subset (seed=42)."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No dedicated limitations section exists. The paper goes from Experiments (Section 4) to Related Work (Section 5) to Conclusion (Section 6) without any limitations or threats-to-validity discussion."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No threats to validity are discussed anywhere in the paper — neither specific nor generic."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No explicit scope boundaries are stated. The paper does not discuss what settings the results do NOT apply to, what hardware or LLM types are excluded, or what claims the authors are not making."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No raw experimental data (per-kernel runtimes, per-iteration optimization traces, LLM outputs) is released or made available for independent verification."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The benchmark (TritonBench-G) is described in Section 4.1 with 183 kernels across 13 categories and 5 difficulty levels. Evaluation protocol including correctness checks (atol=10⁻⁴, rtol=10⁻⁴) and timing methodology is detailed in Appendix H."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No human participants. Data source is a standard public benchmark (TritonBench-G)."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The data pipeline is documented: benchmark selection and modification (Appendix F), subset sampling with explicit seed and stratification (Appendix E), correctness verification stages (Call Accuracy then Execution Accuracy), and speedup computation via weighted aggregation (Appendix H)."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No funding sources, grants, or sponsoring organizations are mentioned anywhere in the paper. No acknowledgments section is present."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Author affiliations are clearly listed: Peking University, East China Normal University, Tianjin University, HKUST, Simon Fraser University, UT Dallas, and several Chinese research centers."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "With no funding disclosure, independence cannot be assessed. The authors are at academic institutions (not at the companies whose LLMs they evaluate), but formal funding independence is not established."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests or financial interests statement is present in the paper."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No training data cutoff dates are stated for any of the four LLMs used (DeepSeek-V3.2, GPT-5, Claude Opus 4.5, Gemini 3 Flash). The LLMs generate kernel code, so training data containing Triton kernel examples could affect performance."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No discussion of whether TritonBench-G kernels or their optimized versions appeared in any LLM's training data."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "TritonBench-G is publicly available and could be in training data. The original TritonBench paper was published before the preprint date (Feb 2026). No contamination risk discussion is provided."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants in this study."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Figure 3 breaks down per-iteration latency (129s parallel wall-clock). Figure 4 plots speedup vs. API cost per kernel ($0.50 budget point). Section 4.4.1 discusses cost-effectiveness in detail."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "Per-kernel per-iteration costs are stated (129s wall-clock, API costs in Figure 4) and hardware platforms are listed, but total computational budget for the full experimental campaign (total GPU hours, total API spend across all experiments) is not reported."
    302       }
    303     },
    304     "experimental_rigor": {
    305       "seed_sensitivity_reported": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "No results across multiple random seeds are reported. The LLM generation process is stochastic (temperature=1.0), but the full optimization pipeline results appear to be from single runs."
    309       },
    310       "number_of_runs_stated": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "The optimization budget T=20 (or T=40 for scaling) is stated, but how many independent repetitions of the full experiment were conducted is not reported. Appendix H states timing uses '1000ms of timed runs' for benchmarking, but this is for kernel timing, not experimental repetitions."
    314       },
    315       "hyperparameter_search_budget": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "Section 4.3.1 tests K ∈ {1,2,3,5}, and Section 3.6 states fixed values for other parameters. However, the full hyperparameter search process (how c=2.0 and θsat=75% were selected, total configurations tried) is not reported."
    319       },
    320       "best_config_selection_justified": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "K=3 is selected in Section 4.3.1 based on results from the same 50-kernel subset used for reporting. No separate validation set is used for configuration selection — the selection and evaluation data overlap."
    324       },
    325       "multiple_comparison_correction": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The paper makes numerous comparisons across methods, GPUs, difficulty levels, and LLM backends without any statistical tests, let alone corrections for multiple comparisons."
    329       },
    330       "self_comparison_bias_addressed": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The authors implemented KERNELBAND and adapted the GEAK baseline (Appendix F), but do not acknowledge or discuss the potential bias of evaluating their own system against their own adaptation of a competitor."
    334       },
    335       "compute_budget_vs_performance": {
    336         "applies": true,
    337         "answer": true,
    338         "justification": "Figure 4 explicitly plots speedup vs. API cost per kernel for all three methods. Figure 2 shows performance as a function of iteration budget (T). Both demonstrate performance-per-compute tradeoffs."
    339       },
    340       "benchmark_construct_validity": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether TritonBench-G actually measures real-world kernel optimization ability. The paper uses the benchmark without questioning whether its 183 kernels across 13 categories and 5 difficulty levels are representative of practical kernel optimization needs."
    344       },
    345       "scaffold_confound_addressed": {
    346         "applies": true,
    347         "answer": true,
    348         "justification": "In the LLM generalization experiment (Table 2), all four LLMs use the identical KERNELBAND scaffold, isolating model capability from scaffolding. Baseline comparisons are between different methods (not different models in different scaffolds), so the scaffold IS the variable being tested."
    349       }
    350     },
    351     "data_leakage": {
    352       "temporal_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "TritonBench-G was published in 2025 and the models tested may have training data including its kernels. No temporal analysis or discussion of this risk is provided."
    356       },
    357       "feature_leakage_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether the evaluation setup (providing reference kernel code to the LLM) leaks information beyond what would be available in a realistic deployment scenario."
    361       },
    362       "non_independence_addressed": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No discussion of whether the 183 TritonBench-G kernels are independent (e.g., kernels from the same category may share structural similarities that inflate apparent generalization)."
    366       },
    367       "leakage_detection_method": {
    368         "applies": true,
    369         "answer": false,
    370         "justification": "No leakage detection or prevention methods (canary strings, membership inference, decontamination) are applied."
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "KernelBand achieves up to 1.91× geometric mean speedup on A100, outperforming GEAK by 42.5% in speedup and 66.2% in success rate.",
    377       "evidence": "Table 1: A100 results show KernelBand at 1.91× G, 79.8% C vs GEAK at 1.34× G, 48.0% C.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "KernelBand improves Fast@1 success rate by 39–140% over GEAK across all GPU platforms.",
    382       "evidence": "Table 1: Fast@1 improvements are 43.3% vs 31.1% (RTX 4090, +39%), 57.3% vs 23.8% (H20, +141%), 60.1% vs 38.7% (A100, +55%).",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Replacing the bandit policy with LLM semantic reasoning regresses performance to 0.97× (below the reference kernel).",
    387       "evidence": "Table 4: LLM Strategy Selection ablation shows 0.97× G, 36.6% F, 68.3% C.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "KernelBand delivers 35–50% higher speedup per dollar than unguided approaches.",
    392       "evidence": "Figure 4: At $0.50/kernel, KernelBand achieves 1.83× vs GEAK's 1.35× (+35%) and BoN's 1.22× (+50%).",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "KernelBand automatically adapts optimization strategies to different GPU hardware bottlenecks.",
    397       "evidence": "Table 10 and Section 4.2: Strategy selection frequencies differ between H20 and RTX 4090 (e.g., FUSION 18.5% vs 12.8%, TILING 7.6% vs 10.0%).",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "KernelBand generalizes across LLM backends, with Claude Opus 4.5 achieving 1.82× speedup and 89.8% correctness.",
    402       "evidence": "Table 2: Results across DeepSeek-V3.2, GPT-5, Claude Opus 4.5, and Gemini 3 Flash all show KernelBand outperforming baselines.",
    403       "supported": "moderate"
    404     },
    405     {
    406       "claim": "KernelBand's regret bound depends on the compact covering number of runtime clusters rather than the vast kernel space.",
    407       "evidence": "Theorem 1 (Section 3.5) with proof in Appendix B. Theoretical result, not empirically validated.",
    408       "supported": "moderate"
    409     }
    410   ],
    411   "red_flags": [
    412     {
    413       "flag": "No error bars or uncertainty quantification",
    414       "detail": "All results are point estimates from what appear to be single experimental runs. With temperature=1.0 LLM generation, substantial run-to-run variance is to be expected, but this is never measured or reported."
    415     },
    416     {
    417       "flag": "No statistical significance tests",
    418       "detail": "Claims of superiority ('outperforming GEAK by 42.5%') are based on comparing raw numbers without any statistical tests. Given the stochastic nature of LLM generation, observed differences could be within noise."
    419     },
    420     {
    421       "flag": "Hyperparameters tuned on test data",
    422       "detail": "K=3 was selected from {1,2,3,5} using the same 50-kernel subset on H20 that is used for reporting results (Section 4.3.1). No separate validation set is used for hyperparameter selection."
    423     },
    424     {
    425       "flag": "Missing baselines due to unavailable code",
    426       "detail": "Three contemporary agent-based methods (STARK, CudaForge, TritonForge) could not be compared because code is unavailable (Appendix F). Only GEAK and the trivial BoN baseline are compared, limiting the strength of state-of-the-art claims."
    427     },
    428     {
    429       "flag": "No limitations section",
    430       "detail": "The paper contains no discussion of limitations, threats to validity, or scope boundaries, despite having clear limitations (single-run results, limited baselines, benchmark-specific tuning)."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "GEAK: Introducing Triton Kernel AI Agent & Evaluation Benchmarks",
    436       "authors": ["J. Wang", "V. Joshi", "S. Majumder", "X. Chao", "B. Ding", "Z. Liu", "P. P. Brahma", "D. Li", "Z. Liu", "E. Barsoum"],
    437       "year": 2025,
    438       "arxiv_id": "2507.23194",
    439       "relevance": "Primary baseline for LLM-based Triton kernel optimization using iterative refinement agent."
    440     },
    441     {
    442       "title": "STARK: Strategic Team of Agents for Refining Kernels",
    443       "authors": ["J. Dong", "Y. Yang", "T. Liu", "Y. Wang", "F. Qi", "V. Tarokh", "K. Rangadurai", "S. Yang"],
    444       "year": 2025,
    445       "arxiv_id": "2510.16996",
    446       "relevance": "Multi-agent collaboration approach for kernel optimization with grounded instruction and strategic search."
    447     },
    448     {
    449       "title": "CudaForge: An Agent Framework with Hardware Feedback for CUDA Kernel Optimization",
    450       "authors": ["Z. Zhang", "R. Wang", "S. Li", "Y. Luo", "M. Hong", "C. Ding"],
    451       "year": 2025,
    452       "arxiv_id": "2511.01884",
    453       "relevance": "LLM agent framework using Nsight Compute profiling feedback for iterative CUDA kernel optimization."
    454     },
    455     {
    456       "title": "TritonBench: Benchmarking Large Language Model Capabilities for Generating Triton Operators",
    457       "authors": ["J. Li", "S. Li", "Z. Gao", "Q. Shi", "Y. Li", "Z. Wang", "J. Huang", "H. Wang", "J. Wang", "X. Han", "Z. Liu", "M. Sun"],
    458       "year": 2025,
    459       "relevance": "The benchmark used for evaluation — measures LLM ability to generate optimized Triton GPU kernels."
    460     },
    461     {
    462       "title": "Kevin: Multi-turn RL for Generating CUDA Kernels",
    463       "authors": ["C. Baronio", "P. Marsella", "B. Pan", "S. Guo", "S. Alberti"],
    464       "year": 2025,
    465       "arxiv_id": "2507.11948",
    466       "relevance": "Training-based approach using multi-turn reinforcement learning for CUDA kernel generation."
    467     },
    468     {
    469       "title": "TritonRL: Training LLMs to Think and Code Triton without Cheating",
    470       "authors": ["J. Woo", "S. Zhu", "A. Nie", "Z. Jia", "Y. Wang", "Y. Park"],
    471       "year": 2025,
    472       "arxiv_id": "2510.17891",
    473       "relevance": "Combines supervised fine-tuning with RL and hierarchical reward for Triton kernel generation."
    474     },
    475     {
    476       "title": "ConCuR: Conciseness Makes State-of-the-Art Kernel Generation",
    477       "authors": ["L. Kong", "J. Wei", "H. Shen", "H. Wang"],
    478       "year": 2025,
    479       "arxiv_id": "2510.07356",
    480       "relevance": "Pipeline for generating and curating CUDA kernels with reasoning traces for supervised fine-tuning."
    481     },
    482     {
    483       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    484       "authors": ["N. Shinn", "F. Cassano", "E. Berman", "A. Gopinath", "K. Narasimhan", "S. Yao"],
    485       "year": 2023,
    486       "relevance": "Foundational work on LLM agent self-reflection loops, used as a framework-level ablation comparison."
    487     },
    488     {
    489       "title": "GPT-4 Technical Report",
    490       "authors": ["J. Achiam", "S. Adler", "S. Agarwal"],
    491       "year": 2023,
    492       "arxiv_id": "2303.08774",
    493       "relevance": "Technical report for one of the foundational LLMs used in code generation and agent systems."
    494     },
    495     {
    496       "title": "TritonForge: Profiling-Guided Framework for Automated Triton Kernel Optimization",
    497       "authors": ["H. Li", "K. Man", "P. Kanuparthy", "H. Chen", "W. Sun", "S. Tallam", "C. Zhu", "K. Zhu", "Z. Qian"],
    498       "year": 2025,
    499       "arxiv_id": "2512.09196",
    500       "relevance": "Integrates runtime profiling with iterative code transformation for Triton kernel optimization."
    501     },
    502     {
    503       "title": "SwizzlePerf: Hardware-Aware LLMs for GPU Kernel Performance Optimization",
    504       "authors": ["A. Tschand", "M. Awad", "R. Swann", "K. Ramakrishnan", "J. Ma", "K. Lowery", "G. Dasika", "V. J. Reddi"],
    505       "year": 2025,
    506       "arxiv_id": "2508.20258",
    507       "relevance": "Hardware-aware LLM approach for GPU kernel performance optimization, related concurrent work."
    508     },
    509     {
    510       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    511       "authors": ["D. Guo", "Q. Zhu", "D. Yang"],
    512       "year": 2024,
    513       "arxiv_id": "2401.14196",
    514       "relevance": "Code LLM used in the kernel optimization pipeline; relevant to code generation capabilities."
    515     }
    516   ]
    517 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs