scan.json (29558B)
1 { 2 "paper": { 3 "title": "Optimizing PyTorch Inference with LLM-Based Multi-Agent Systems", 4 "authors": [ 5 "Kirill Nagaitsev", 6 "Luka Grbčić", 7 "Samuel Williams", 8 "Costin Iancu" 9 ], 10 "year": 2025, 11 "venue": "arXiv", 12 "arxiv_id": "2511.16964", 13 "doi": "10.48550/arXiv.2511.16964" 14 }, 15 "scan_version": 2, 16 "active_modules": ["experimental_rigor", "data_leakage"], 17 "methodology_tags": ["benchmark-eval"], 18 "key_findings": "The paper presents PIKE, a logical framework for comparing multi-agent LLM-based PyTorch optimization systems. PIKE-B (exploit-heavy, branching search with error-fixing agents) achieves 2.88× geomean speedup over PyTorch Eager on H100 GPU across a refined KernelBench suite, outperforming torch.compile (1.64×), TensorRT (1.41×), and METR (1.40×). Exploit-heavy strategies paired with error-fixing agents consistently outperform explore-heavy strategies, and performance correlates with the granularity (aggressiveness) of optimization steps.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "No repository URL or code archive is provided anywhere in the paper. The PIKE implementations are described algorithmically but no source code link is given." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "The paper uses KernelBench (Ouyang et al., 2025), a publicly available benchmark. The METR-refined variant is also publicly available. The authors describe their further filtering (Level 3-pike) in §4.1 and Table 1." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "Hardware is specified (H100 80GB HBM3, Intel Xeon Platinum 8480+, 40+ CPU threads) and the evaluator is containerized with Docker. However, no requirements.txt, Dockerfile, or software version list is provided — PyTorch version, CUDA version, Triton version, etc. are not stated." 
35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No reproduction instructions are provided. The paper describes the algorithm (Algorithm 1) and hyperparameters, but there are no step-by-step instructions or scripts for replicating the experiments." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "All results are reported as point estimates (e.g., 2.88× speedup). No confidence intervals or error bars are provided for any results, despite the stochastic nature of LLM generation (temperature 0.8)." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "No statistical significance tests are used. Claims like 'PIKE-B outperforms PIKE-O' are based solely on comparing geomean speedup numbers without any statistical testing." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Speedup ratios relative to PyTorch Eager baseline are reported throughout (e.g., 2.88× for PIKE-B, 1.64× for torch.compile). These provide baseline context for the magnitude of improvement." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "The benchmark suite contains 30 Level 3-pike tasks and 14 Level 5 tasks. No justification is given for why these sample sizes are sufficient for the claims made." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "No variance, standard deviation, or spread measures across runs are reported. Each configuration appears to be run once with the 300-query budget. Given the stochastic LLM (temperature 0.8), repeated runs would likely yield different results." 
67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Multiple baselines are included: PyTorch Eager (reference), torch.compile with max autotuning, TensorRT, and METR solutions (§4.2). Results are shown in Tables 3-4 and Figures 3-5." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "METR (2025) represents the state of the art for LLM-based kernel optimization. TorchInductor and TensorRT are current production compilers. All baselines are contemporary." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "Extensive ablations are performed: PIKE-B (no EFA, no IBA, cheap EFA) in §5.2.1, and PIKE-O variants progressively modified toward PIKE-B (mut, npar, 1isl, EO, SL) in §5.2.2. Figure 5 summarizes all ablations." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "Results are reported using speedup (geomean), LLM query count, monetary cost per task, SLOC, mean error-fix attempts, LoC changed per step, and cosine similarity between optimization steps (§4.5-4.7, Table 2)." 89 }, 90 "human_evaluation": { 91 "applies": false, 92 "answer": false, 93 "justification": "Human evaluation is irrelevant to the claims. The paper evaluates GPU kernel optimization quality through automated correctness checks and runtime measurements." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": false, 98 "justification": "PIKE-B hyperparameters were selected via 'initial tuning' (§4.4) but no validation/test split is described. It is unclear whether tuning was done on the same benchmark tasks used for final evaluation." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Tables 3 and 4 provide per-task speedup breakdowns for all 30 Level 3-pike and 14 Level 5 tasks across all methods. 
Appendix B provides detailed analysis of individual task results." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Appendix B discusses task removals (LSTM/GRU, §B.1.1), MLP noise issues (§B.1.2), invalid solutions (§B.3.6 — invalid METR DeepSeek3MLA solution), and challenging CUDA techniques (§B.4). Tasks where PIKE underperforms are visible in per-task tables." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "Several negative results: PIKE-O (mut,npar) drops to 1.99 when parallelism is reduced (§5.2.2); PIKE-O default does not benefit from EFA (2.17 vs 2.15, §5.2.2); crossover vs mutation shows no improvement (PIKE-O mut same as default at 2.15); PIKE-B without EFA drops to 1.98 (§5.2.1)." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "Abstract claims: (1) 2.88× speedup on H100 — confirmed in Table 3 geomean; (2) exploit-heavy strategies perform best with error-fixing — confirmed across §5.1-5.2; (3) performance correlates with step granularity — supported by Figures 6-7 and Table 2." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "Causal claims are made via controlled ablation: removing EFA from PIKE-B drops performance from 2.88 to 1.98 (§5.2.1); progressive PIKE-O ablations isolate the effect of each component change (§5.2.2). Single-variable manipulations are adequate for these claims." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": true, 130 "justification": "The abstract specifies 'on an H100 GPU across diverse tasks in KernelBench.' The paper uses a single LLM (Gemini 2.5 Pro) and a single GPU type, and does not claim general applicability beyond this setting. Results are presented specifically for KernelBench Level 3-pike and Level 5." 
131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper does not consider alternative explanations for its findings. For example, the PIKE vs METR comparison confounds the multi-agent strategy with the model choice (Gemini 2.5 Pro vs o1/Claude/GPT-4o). No discussion of whether Gemini 2.5 Pro's specific capabilities drive the results rather than the search strategy." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper measures speedup relative to PyTorch Eager on specific benchmark tasks and frames results as 'inference speedup.' The measurement matches the claim granularity — no proxy gap exists between what is measured and what is claimed." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper uses 'Gemini 2.5 Pro' and 'Gemini 2.5 Flash' (§4.5) — marketing names without snapshot dates or API versions. Also mentions 'OpenAI's text-embedding-3-large' (§4.7) without a version. Per the schema, marketing names without API versions do not count." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": false, 152 "justification": "Agent roles are described conceptually (IBA generates ideas, COA optimizes code, EFA fixes errors) but no actual prompt text is provided for any agent. The reader cannot reconstruct the prompts sent to the LLM." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": true, 157 "justification": "Detailed hyperparameters in §4.4: temperature 0.8, PIKE-B (n=10, m=5, k=4), PIKE-O (population=25, archive=12, islands=3, explore=0.2, exploit=0.7, parallel evaluations=10). Correctness tolerance in Appendix A (atol=0.01, rtol=0.01)." 
158 }, 159 "scaffolding_described": { 160 "applies": true, 161 "answer": true, 162 "justification": "The multi-agent framework is described in detail in §2-3: library management, seed selection with explore/exploit ratios, mutation vs crossover, error-fixing loops, island-based genetic algorithms. Figure 1 shows the workflow, Algorithm 1 provides PIKE-B pseudocode." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Benchmark filtering is documented: §4.1 describes removing Levels 1-2, filtering LSTM/GRU from Level 3, using METR's Level 5. Table 1 summarizes the final suite. Appendix B.1 details specific task removals with rationale." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": false, 174 "justification": "No dedicated limitations section exists. Budget constraints are mentioned in passing ('Due to the high cost of experimentation, we were unable to conduct a complete hyperparameter sweep' in §4.4) but there is no substantive limitations discussion." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": false, 179 "justification": "No threats to validity are discussed. The paper does not address threats like single-GPU generalizability, single-model dependence, lack of repeated runs, or benchmark representativeness." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": false, 184 "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show — e.g., whether findings transfer to other GPUs, other LLMs, other optimization targets, or production workloads beyond KernelBench." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "No raw data is released. 
Generated solutions, LLM responses, evaluation logs, and timing measurements are not made available for independent verification." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "The evaluation procedure is well-described: Docker-containerized evaluation, Triton's do_bench for timing with warmup and synchronization, torch.allclose correctness checks with specified tolerances (§4.3, §4.6, Appendix A)." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants. Data comes from KernelBench, a standard public benchmark." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "The pipeline from benchmark selection through filtering, evaluation, correctness checking, timing, and metric computation is documented across §4.1-4.6. Task removal rationale is provided in Appendix B.1." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Funding is disclosed in the Acknowledgements: U.S. Department of Energy, Office of Science (DE-FOA-0003264, FP00018807, DE-AC02-05CH11231) and DOE Computational Science Graduate Fellowship (DE-SC0024386)." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are listed: Northwestern University and Lawrence Berkeley National Laboratory. The authors are not evaluating a product from their own employer." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": true, 223 "justification": "The U.S. Department of Energy funds basic scientific computing research and has no financial interest in the specific outcome of PIKE vs other optimization methods." 
224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial interests statement is present in the paper." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": true, 234 "answer": false, 235 "justification": "The training cutoff for Gemini 2.5 Pro/Flash is not stated. The LLM is generating code optimizations for benchmark tasks, and its training data could include prior solutions." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": true, 239 "answer": false, 240 "justification": "No discussion of whether Gemini 2.5 Pro was trained on KernelBench tasks, prior solutions, or optimized PyTorch code that could inform its generations." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": true, 244 "answer": false, 245 "justification": "KernelBench was published in early 2025. Gemini 2.5 Pro could have been trained on the benchmark tasks or prior solutions. This contamination risk is not addressed." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 
278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": true, 289 "justification": "Detailed cost analysis in §4.5 and Figures 3b/4b: $25-50 per task for 300 queries, mean cost $50.96 (PIKE-B) vs $39.59 (PIKE-O) per task (Table 2). Gemini API pricing cited. Cost-performance tradeoffs analyzed." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": true, 294 "justification": "Compute budget clearly stated: 300 LLM queries per task, H100 GPU with 80GB HBM3, 40+ CPU threads, parallel CUDA/Triton compilations. Total suite cost: $25 × 30 = $750 for Level 3-pike at $25/task budget (§4.5)." 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "Results are not reported across multiple random seeds. Despite using LLM temperature 0.8, each configuration appears to be run once. No analysis of how results vary across independent optimization runs." 302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": false, 306 "justification": "The budget is 300 queries per task, but the number of independent optimization runs is not explicitly stated. It appears each configuration was run once. The comparison with METR notes 'our single 300-query runs are compared against the best results from 4 or more METR runs.'" 307 }, 308 "hyperparameter_search_budget": { 309 "applies": true, 310 "answer": false, 311 "justification": "PIKE-B parameters were selected via 'initial tuning' (§4.4) but no search budget is reported — how many configurations were tried, what method was used, or what compute was spent on tuning." 
312 }, 313 "best_config_selection_justified": { 314 "applies": true, 315 "answer": true, 316 "justification": "The systematic PIKE-O ablation (§5.2.2) traces configuration choices through progressive modifications (mut → npar → 1isl → EO → SL), showing the effect of each change. PIKE-B ablation (§5.2.1) demonstrates the contribution of EFA and IBA." 317 }, 318 "multiple_comparison_correction": { 319 "applies": true, 320 "answer": false, 321 "justification": "Many pairwise comparisons are made across configurations, levels, and budgets without any statistical tests or multiple comparison corrections." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": false, 326 "justification": "The authors implement their own PIKE variants and compare against METR (whose framework is not open-source). They run METR's best solutions on their hardware (fair), but do not acknowledge or discuss self-comparison bias per Lucic et al. (2018)." 327 }, 328 "compute_budget_vs_performance": { 329 "applies": true, 330 "answer": true, 331 "justification": "Figures 3 and 4 explicitly plot performance (geomean speedup) as a function of both LLM queries per task and cost per task. This allows direct comparison at matched budgets." 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": true, 336 "justification": "The paper critically evaluates benchmark quality: removing LSTM/GRU tasks due to pathological baselines (§B.1.1), identifying MLP noise (§B.1.2), documenting invalid METR solutions (§B.3.6), and creating a refined Level 3-pike suite. This shows active engagement with construct validity." 337 }, 338 "scaffold_confound_addressed": { 339 "applies": true, 340 "answer": false, 341 "justification": "The comparison against METR confounds the multi-agent strategy with the LLM choice: PIKE uses Gemini 2.5 Pro while METR uses o1, Claude 3.5 Sonnet, gpt-4o, and o3-mini-high. The paper does not control for or discuss this confound." 
342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "Not discussed. KernelBench tasks are based on well-known PyTorch architectures (ResNet, DenseNet, MinGPT, etc.) that have been extensively discussed online. The LLM could have seen optimized versions in training." 349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "Not discussed. The evaluation gives the LLM full access to the PyTorch source code, which is the intended setup, but no analysis of whether prior knowledge from training influences generations." 354 }, 355 "non_independence_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "Not discussed. Many Level 3-pike tasks share structural similarities (multiple DenseNet variants, multiple EfficientNet variants, etc.), and solutions to one task could inform another. This non-independence is not addressed." 359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": false, 363 "justification": "No leakage detection or prevention methods are used. No canary strings, membership inference tests, or decontamination analysis." 364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "PIKE-B achieves an average 2.88× speedup on H100 GPU across Level 3-pike KernelBench tasks", 370 "evidence": "Table 3 shows per-task speedups with geomean 2.88×. Figure 5(a) confirms this is the best-performing configuration at 300-query budget.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Exploit-heavy strategies outperform explore-heavy strategies for PyTorch optimization", 375 "evidence": "PIKE-B (exploit-heavy, 2.88×) outperforms default PIKE-O (explore-heavy, 2.17×). Progressive PIKE-O ablation toward exploitation improves from 2.17 to 2.81 (§5.2.2, Figure 5). 
Consistent across both Level 3-pike and Level 5.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Error-fixing agents (EFA) are critical for exploit-heavy strategies but not for explore-heavy ones", 380 "evidence": "PIKE-B drops from 2.88 to 1.98 without EFA (§5.2.1). PIKE-O changes negligibly from 2.17 to 2.15 without EFA (§5.2.2). Explained by PIKE-B producing riskier, larger code changes that need more fixing (Table 2, Figure 6).", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Performance correlates with optimization step granularity — larger steps yield better results", 385 "evidence": "Table 2: PIKE-B mean SLOC 244 vs PIKE-O 169. Figures 6-7 show PIKE-B changes more LoC per step. PIKE-B achieves higher speedup despite fewer total steps (160 vs 198). Correlation between step size and performance across PIKE-O variants also shown.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "PIKE-B with cheap EFA (Gemini 2.5 Flash) provides the best cost-performance tradeoff at $25/task", 390 "evidence": "Figure 3(b) shows PIKE-B cheap EFA achieves 2.51× at $25/task vs 2.31× for standard PIKE-B and 2.33× for PIKE-O variant. Mean EFA query cost drops from $0.15 to $0.04 (§5.2.1).", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "PIKE outperforms torch.compile, TensorRT, and METR across both benchmark levels", 395 "evidence": "Level 3-pike (Table 3, Figure 5a): PIKE-B 2.88× vs torch.compile 1.64×, TensorRT 1.41×, METR 1.40×. Level 5 (Table 4, Figure 5b): PIKE-B 2.57× vs torch.compile 1.29×, TensorRT 1.25×, METR 1.50×.", 396 "supported": "moderate" 397 } 398 ], 399 "red_flags": [ 400 { 401 "flag": "No variance or error bars on a stochastic system", 402 "detail": "All results are single-run point estimates despite using LLM temperature 0.8. The evolutionary search process is inherently stochastic, yet no repeated runs, confidence intervals, or variance measures are reported. 
Geomean speedup differences of 0.07 (e.g., PIKE-B 2.88 vs PIKE-O variant 2.81) could easily fall within run-to-run variance." 403 }, 404 { 405 "flag": "Confounded METR comparison", 406 "detail": "PIKE uses Gemini 2.5 Pro exclusively while METR uses o1, Claude 3.5 Sonnet, gpt-4o, and o3-mini-high. The paper attributes PIKE's advantage to the multi-agent strategy, but the model choice is an uncontrolled confound. Additionally, METR's 'best of 4+ runs' is compared against PIKE's single run, which could either advantage or disadvantage PIKE." 407 }, 408 { 409 "flag": "No contamination analysis for benchmark tasks", 410 "detail": "KernelBench tasks are based on well-known architectures (ResNet, DenseNet, MinGPT, etc.) with extensive online optimization discussions. Gemini 2.5 Pro's training data likely includes optimized CUDA/Triton implementations for these architectures, yet no contamination analysis is performed." 411 }, 412 { 413 "flag": "No code release for reproduction", 414 "detail": "Neither the PIKE framework code nor generated solutions are released. The paper describes algorithms at a high level but key implementation details (actual prompts, evaluation harness specifics, filtering logic) are missing. Full reproduction is impossible." 415 }, 416 { 417 "flag": "No limitations section", 418 "detail": "The paper has no limitations or threats-to-validity discussion. Significant limitations — single GPU type, single LLM, single benchmark, single run, no prompt release, incomplete hyperparameter search — are not acknowledged." 419 } 420 ], 421 "cited_papers": [ 422 { 423 "title": "KernelBench: Can LLMs Write Efficient GPU Kernels?", 424 "authors": ["Anne Ouyang", "Simon Guo", "Simran Arora", "Alex L. Zhang", "William Hu", "Christopher Ré", "Azalia Mirhoseini"], 425 "year": 2025, 426 "arxiv_id": "2502.10517", 427 "relevance": "The primary benchmark used for evaluation; defines the LLM-generated GPU kernel task." 
428 }, 429 { 430 "title": "Measuring Automated Kernel Engineering", 431 "authors": ["METR"], 432 "year": 2025, 433 "relevance": "Refined KernelBench with Level 5 tasks and provides the primary baseline for LLM-based kernel optimization comparison." 434 }, 435 { 436 "title": "AlphaEvolve: A Coding Agent for Scientific and Algorithmic Discovery", 437 "authors": ["Alexander Novikov", "Ngân Vũ", "Marvin Eisenberger"], 438 "year": 2025, 439 "arxiv_id": "2506.13131", 440 "relevance": "LLM-based code evolution framework that inspired OpenEvolve and PIKE-O's evolutionary approach." 441 }, 442 { 443 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 444 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"], 445 "year": 2023, 446 "arxiv_id": "2310.06770", 447 "relevance": "Influential benchmark for LLM-based software engineering agents, cited as related evaluation framework." 448 }, 449 { 450 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations", 451 "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"], 452 "year": 2024, 453 "relevance": "Multi-agent LLM framework combining outputs from different models; contextualizes PIKE in the broader multi-agent landscape." 454 }, 455 { 456 "title": "MetaGPT: Meta Programming for a Multi-Agent Collaborative Framework", 457 "authors": ["Sirui Hong", "Mingchen Zhuge", "Jonathan Chen"], 458 "year": 2024, 459 "relevance": "Multi-agent framework splitting work among specialized agents for coding tasks; related approach to PIKE's agent specialization." 460 }, 461 { 462 "title": "The AI CUDA Engineer: Agentic CUDA Kernel Discovery, Optimization and Composition", 463 "authors": ["Robert Tjarko Lange", "Aaditya Prasad", "Qi Sun"], 464 "year": 2025, 465 "relevance": "Agent-driven CUDA kernel optimization system using evolutionary search and LLM verification." 
466 }, 467 { 468 "title": "CUDA-L1: Improving CUDA Optimization via Contrastive Reinforcement Learning", 469 "authors": ["Xiaoya Li", "Xiaofei Sun", "Albert Wang"], 470 "year": 2025, 471 "arxiv_id": "2507.14111", 472 "relevance": "RL-based CUDA optimization with runtime feedback, alternative approach to LLM-based kernel generation." 473 }, 474 { 475 "title": "Astra: A Multi-Agent System for GPU Kernel Performance Optimization", 476 "authors": ["Anjiang Wei", "Tianran Sun", "Yogesh Seenichamy"], 477 "year": 2025, 478 "arxiv_id": "2509.07506", 479 "relevance": "Multi-agent system for production SGLang kernel optimization, coordinating generation, testing, profiling, and planning agents." 480 }, 481 { 482 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 483 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 484 "year": 2024, 485 "relevance": "Cost-effective LLM cascade strategy relevant to PIKE's cost-performance analysis." 486 }, 487 { 488 "title": "Kevin: Multi-Turn RL for Generating CUDA Kernels", 489 "authors": ["Carlo Baronio", "Pietro Marsella", "Ben Pan"], 490 "year": 2025, 491 "arxiv_id": "2507.11948", 492 "relevance": "Multi-turn RL CUDA optimizer balancing correctness and performance, exceeding frontier models." 493 }, 494 { 495 "title": "MLAgentBench: Evaluating Language Agents on Machine Learning Experimentation", 496 "authors": ["Qian Huang", "Jian Vora", "Percy Liang", "Jure Leskovec"], 497 "year": 2024, 498 "relevance": "Benchmark for evaluating LLM agents on ML tasks; part of the broader agent evaluation landscape." 499 } 500 ] 501 }