scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20692B)
      1 {
      2   "paper": {
      3     "title": "CudaForge: An Agent Framework with Hardware Feedback for CUDA Kernel Optimization",
      4     "authors": ["Zijian Zhang", "Rong Wang", "Shiyang Li", "Yuebo Luo", "Mingyi Hong", "Caiwen Ding"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2511.01884"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub link provided in abstract: https://github.com/OptimAI-Lab/CudaForge"
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "They use the publicly available KernelBench benchmark (250 tasks, Levels 1-3). The benchmark is publicly accessible."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment setup with library versions is described in the paper. GPUs are mentioned but software dependencies are not specified."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but the paper itself does not contain a 'Reproducing Results' section or specific commands."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results are reported as point estimates (e.g., 97.6% correctness, 1.677x speedup) with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims CudaForge outperforms baselines based solely on comparing numbers without any statistical significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Effect sizes are reported with baseline context throughout, e.g., '1.677x speedup over PyTorch baselines' and comparisons like Kevin-32B's 1.10x vs CudaForge's 1.662x. Percentage improvements and absolute speedup values are consistently given."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The stratified subset D* uses only 25 tasks but no justification is given for why 25 tasks is sufficient. No power analysis is discussed."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or spread measures are reported across runs. Results appear to be from single runs per configuration."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Multiple baselines are included: OpenAI-o3 (one-shot), o3-self-refine, o3-correction, o3-optimization, Kevin-32B, Agentic Baseline, and CudaForge(full metrics)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include Kevin-32B (2025), Agentic Baseline (2025), and OpenAI-o3 (2025), all contemporary methods."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Comprehensive ablation studies in Section 3.6: o3-self-refine (no Judge), o3-correction (correction-only), o3-optimization (optimization-only), full metrics vs. subset metrics. Section 3.7 covers GPU and base model variations."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Five metrics are used: Correctness, Performance (average speedup), Fast1, Median speedup, and 75th percentile speedup."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of the generated kernels' quality beyond automated correctness checking. The paper mentions manually checking for 'fake kernels' in CUDA-L1 but this is not a systematic human evaluation of CudaForge outputs."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The NCU metric selection procedure uses representative tasks (Conv2D, MatMul) from KernelBench, then the full system is evaluated on KernelBench. There is no discussion of whether metric selection tasks overlap with evaluation tasks."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 2 provides per-level breakdowns (Level 1, 2, 3) across all metrics. Table 5 provides per-model breakdowns."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4 (Case study) shows both optimization and correction (failure/repair) rounds. The case study in Figure 8 includes a correction round where a kernel failed numerical checks."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that using full NCU metrics leads to worse performance than the subset (Table 1), and that some model combinations (e.g., QwQ/O3) achieve only 0.790x average speedup (below 1x). Claude-Sonnet-4 as Coder achieves only 88% correctness."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims of 97.6% correctness, 1.68x average speedup, 2.27x speedup when scaling up the maximum iteration rounds, generalization across GPUs and base models, and $0.3 cost are all supported by Tables 1-5 and Section 3.5."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about the importance of hardware feedback, Judge role separation, and metric selection are supported by controlled ablation studies (Section 3.6) that isolate individual components."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title and abstract claim 'CUDA Kernel Optimization' broadly, but results are limited to KernelBench tasks (250 tasks of specific types). The paper does not explicitly bound claims to this benchmark setting. Claims like 'cost-effective, generalizable, and high-performance CUDA kernel optimization' extend beyond what was tested."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for the results. For example, the performance gains could partly be due to the choice of base model (o3) rather than the framework design. No threats-to-validity or confound analysis is presented."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper uses 'OpenAI-o3', 'GPT-5', 'Claude-Sonnet-4', 'GPT-OSS-120B', 'QwQ-32B' without specifying API versions or snapshot dates. These are marketing names without version identifiers."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full prompts for the Coder (initial round, correction, optimization) and Judge (correction, optimization) are provided in Appendix A with actual text, including output format specifications and rules."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No temperature, top-p, max tokens, or other LLM API sampling parameters are reported. Only N=10 maximum iteration rounds and tolerance 1e-4 are stated."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The agentic scaffolding is described in detail: two-agent Coder/Judge architecture, iterative workflow, lightweight memory design (no conversation history), correction vs. optimization modes, NCU profiling integration, and JSON-structured feedback (Section 2)."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The NCU metric selection pipeline is documented in detail (Algorithms 1-2): kernel sampling, Top-20 per-task selection via Pearson correlation, cross-task consolidation with P75 threshold. The stratified subset D* construction is described in Section 3.1 and Appendix D.2."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated Limitations or Threats to Validity section in the paper."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity are discussed anywhere in the paper."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show or what settings are excluded. No explicit boundary statements about generalization limits."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw experimental data (per-task speedups, generated kernels, NCU profiles) is released for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Data collection is described: KernelBench tasks (Levels 1-3, 250 tasks), stratified random sampling for D* (25 tasks), NCU profiling procedure, and correctness test criteria (Section 3.1)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants; the study uses a standard public benchmark (KernelBench)."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full pipeline from task input to final kernel selection is documented: Coder generates kernel, compilation test, execution test with 1e-4 tolerance, NCU profiling, Judge feedback, iterative refinement, best correct kernel selection (Section 2)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors are affiliated with University of Minnesota, Twin Cities, clearly stated on the first page."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure is itself a concern."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper uses OpenAI-o3, GPT-5, Claude-Sonnet-4, etc. without stating their training data cutoff dates. KernelBench was published in 2025 and these models may have seen it."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether KernelBench tasks or their solutions appeared in the training data of the models used."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "KernelBench was published on arXiv in Feb 2025. The models used (o3, GPT-5) may have been trained on data including KernelBench. This contamination risk is not discussed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Table 3 reports API cost ($0.30 average per kernel) and wall-clock time (26.5 minutes average) broken down by level. Figure 6 shows cost-performance tradeoff curves."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Total compute is implicitly stated: 250 tasks at ~26.5 min each on a single RTX 6000, with ~$0.30 API cost per kernel. Comparison with Agentic Baseline (6 H100 hours, $5 per kernel) is provided."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CudaForge achieves 97.6% correctness and 1.68x average speedup over PyTorch baselines on KernelBench Levels 1-3.",
    286       "evidence": "Table 1 shows 97.6% correctness and 1.677x performance on 250 KernelBench tasks (Section 3.3).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "CudaForge outperforms Kevin-32B (RL-based) with 98.0% correctness and 1.662x speedup vs. Kevin's 82.0% and 1.10x on comparable tasks.",
    291       "evidence": "Section 3.4 and Figure 5 compare CudaForge on H200 against Kevin-32B's reported results on Levels 1-2.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Scaling up maximum iteration rounds to 30 improves CudaForge's performance to 2.27x average speedup.",
    296       "evidence": "Figure 7 and Table 1 (CudaForge-Scaling Up) show 2.265x on the 25-task D* subset (Section 3.7).",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "CudaForge generalizes across GPUs (A100, RTX 6000, 4090, 3090) and base models (o3, GPT-5, Claude-Sonnet-4, GPT-OSS-120B, QwQ-32B).",
    301       "evidence": "Table 4 shows consistent results across 4 GPUs. Table 5 shows results with various model combinations. All evaluated on D* (25 tasks).",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "CudaForge is significantly cheaper than the Agentic Baseline: $0.3 vs $5 API cost and 26.5 min vs 60 min per kernel.",
    306       "evidence": "Table 3 reports costs. However, the Agentic Baseline costs are taken from their paper and run on different hardware (H100).",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Using a curated subset of 24 NCU metrics is more effective than using all NCU metrics.",
    311       "evidence": "Table 1 compares CudaForge vs CudaForge(full metrics) on D*: 1.767x vs 1.414x. Case study in Appendix B.1.",
    312       "supported": "moderate"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval"],
    316   "key_findings": "CudaForge is a training-free multi-agent framework for CUDA kernel generation that uses a Coder and Judge with hardware feedback from Nsight Compute metrics. On KernelBench (250 tasks), it achieves 97.6% correctness and 1.68x average speedup over PyTorch, outperforming RL-based methods (Kevin-32B) and direct LLM generation. The framework demonstrates generalization across GPU architectures and base models, with inference-time scaling showing continued improvement up to 30 iterations. Cost is reported at ~$0.30 and 26.5 minutes per kernel on a single RTX 6000.",
    317   "red_flags": [
    318     {
    319       "flag": "No statistical uncertainty quantification",
    320       "detail": "All results are single-run point estimates with no error bars, confidence intervals, or variance across runs. Given LLM output stochasticity, results could vary significantly across runs."
    321     },
    322     {
    323       "flag": "No limitations section",
    324       "detail": "The paper has no limitations, threats to validity, or scope boundary discussion, despite making broad generalization claims."
    325     },
    326     {
    327       "flag": "Unfair baseline comparison conditions",
    328       "detail": "Kevin-32B comparison uses results from their paper on different hardware/benchmark. Agentic Baseline comparison uses their reported costs on H100 vs CudaForge on RTX 6000. These are not fully controlled comparisons."
    329     },
    330     {
    331       "flag": "Small subset for key experiments",
    332       "detail": "Many ablation and generalization experiments (GPU cross-testing, model combinations, scaling) are run on D* with only 25 tasks, which may not be representative. No justification for this sample size."
    333     },
    334     {
    335       "flag": "Best-of-N evaluation protocol",
    336       "detail": "The paper reports 'the best-performing correct kernel among all candidates for each task,' which inflates reported performance compared to expected single-run behavior."
    337     },
    338     {
    339       "flag": "No contamination analysis",
    340       "detail": "KernelBench was published on arXiv in Feb 2025, so the models used may have been trained on data that includes it. The paper does not discuss whether the models may have memorized benchmark solutions."
    341     }
    342   ],
    343   "cited_papers": [
    344     {
    345       "title": "Kevin: Multi-turn RL for Generating CUDA Kernels",
    346       "authors": ["Carlo Baronio", "Pietro Marsella", "Ben Pan", "Simon Guo", "Silas Alberti"],
    347       "year": 2025,
    348       "arxiv_id": "2507.11948",
    349       "relevance": "RL-based approach for CUDA kernel generation, direct baseline comparison for CudaForge."
    350     },
    351     {
    352       "title": "Towards Robust Agentic CUDA Kernel Benchmarking, Verification, and Optimization",
    353       "authors": ["Robert Tjarko Lange", "Qi Sun", "Aaditya Prasad", "Maxence Faldor", "Yujin Tang", "David Ha"],
    354       "year": 2025,
    355       "arxiv_id": "2509.14279",
    356       "relevance": "Multi-agent agentic baseline for CUDA kernel optimization, key comparison point."
    357     },
    358     {
    359       "title": "KernelBench: Can LLMs Write Efficient GPU Kernels?",
    360       "authors": ["Anne Ouyang", "Simon Guo", "Simran Arora", "Alex L. Zhang", "William Hu", "Christopher Ré", "Azalia Mirhoseini"],
    361       "year": 2025,
    362       "arxiv_id": "2502.10517",
    363       "relevance": "The primary benchmark used for evaluation; assesses LLM capability for CUDA code generation."
    364     },
    365     {
    366       "title": "A Survey on Code Generation with LLM-based Agents",
    367       "authors": ["Yihong Dong", "Xue Jiang", "Jiaru Qian"],
    368       "year": 2025,
    369       "arxiv_id": "2508.00083",
    370       "relevance": "Survey of LLM-based agents for code generation, relevant to agentic programming survey scope."
    371     },
    372     {
    373       "title": "A Survey on Large Language Models for Code Generation",
    374       "authors": ["Juyong Jiang", "Fan Wang", "Jiasi Shen", "Sungju Kim", "Sunghun Kim"],
    375       "year": 2024,
    376       "arxiv_id": "2406.00515",
    377       "relevance": "Survey of LLMs for code generation, contextualizes LLM code capabilities."
    378     },
    379     {
    380       "title": "AgentVerse: Facilitating Multi-Agent Collaboration and Exploring Emergent Behaviors",
    381       "authors": ["Weize Chen"],
    382       "year": 2023,
    383       "arxiv_id": "2308.10848",
    384       "relevance": "Multi-agent collaboration framework that inspired CudaForge's prompt design."
    385     },
    386     {
    387       "title": "CUDA-L1: Improving CUDA Optimization via Contrastive Reinforcement Learning",
    388       "authors": ["DeepReinforce Team"],
    389       "year": 2025,
    390       "relevance": "RL-based CUDA optimization approach whose 'fake kernel' problem is exposed by CudaForge authors."
    391     },
    392     {
    393       "title": "Agent S: An Open Agentic Framework that Uses Computers Like a Human",
    394       "authors": ["Saaket Agashe"],
    395       "year": 2024,
    396       "arxiv_id": "2410.08164",
    397       "relevance": "Agentic framework for computer interaction, inspired CudaForge's agent prompt design."
    398     }
    399   ]
    400 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs