scan.json (32346B)
1 { 2 "paper": { 3 "title": "SysLLMatic: Large Language Models are Software System Optimizers", 4 "authors": [ 5 "Huiyun Peng", 6 "Arjun Gupte", 7 "Ryan Hasler", 8 "Nicholas John Eliopoulos", 9 "Chien-Chou Ho", 10 "Rishi Mantri", 11 "Leo Deng", 12 "Konstantin Läufer", 13 "George K. Thiruvathukal", 14 "James C. Davis" 15 ], 16 "year": 2025, 17 "venue": "arXiv", 18 "arxiv_id": "2506.01249", 19 "doi": "10.48550/arXiv.2506.01249" 20 }, 21 "scan_version": 2, 22 "active_modules": ["experimental_rigor", "data_leakage"], 23 "methodology_tags": ["benchmark-eval"], 24 "key_findings": "SysLLMatic, an LLM-based code optimization system integrating a catalog of 43 performance patterns with profiling-guided feedback, achieves average 1.5× latency and 1.76× throughput improvements on five DaCapo real-world Java applications, outperforming compiler baselines on average (1.01× latency, 1.02× throughput). However, compiler baselines remain competitive or superior on microbenchmarks (HumanEval_CPP) and on some individual DaCapo applications (Fop, ZXing). Open-source LLMs are competitive on microbenchmarks but fall short on large-scale applications. The BioJava parallelization result (3.44× latency, 4.63× throughput) dominates the DaCapo averages while other applications show modest or no improvement.", 25 "checklist": { 26 "artifacts": { 27 "code_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "§11 states 'Our artifact is: https://github.com/sysllmatic/sysllmatic.' A public URL is provided." 31 }, 32 "data_released": { 33 "applies": true, 34 "answer": true, 35 "justification": "All three benchmarks are publicly available: HumanEval-X, SciMark2, and DaCapo. The paper uses unmodified public benchmarks and provides an artifact repository." 
36 }, 37 "environment_specified": { 38 "applies": true, 39 "answer": false, 40 "justification": "§6.7 describes hardware (Intel Xeon W-2295, 36 CPUs, 188 GB RAM; NVIDIA H100 GPUs) and mentions tools (ANTLR4, async-profiler, perf, JaCoCo, Lizard, ollama), but no requirements.txt, Dockerfile, or detailed dependency version listing is provided in the paper." 41 }, 42 "reproduction_instructions": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper describes the system architecture and methodology in detail (§5, Algorithm 1) but does not include step-by-step reproduction instructions. The prompt excerpts in §5.6.1 are explicitly shortened: 'These excerpts preserve the key instructions and style of the full prompts while omitting implementation-specific details.'" 46 } 47 }, 48 "statistical_methodology": { 49 "confidence_intervals_or_error_bars": { 50 "applies": true, 51 "answer": false, 52 "justification": "Tables 8 and 9 report only point estimates (e.g., '1.55×', '88.41%'). Despite performing 5 measured runs per program (§6.4.1), no confidence intervals or error bars are reported on those averages." 53 }, 54 "significance_tests": { 55 "applies": true, 56 "answer": false, 57 "justification": "The paper claims SysLLMatic 'outperforms' baselines based solely on comparing point estimates in Tables 8 and 9. No statistical significance tests (t-tests, Wilcoxon, etc.) are reported." 58 }, 59 "effect_sizes_reported": { 60 "applies": true, 61 "answer": true, 62 "justification": "Relative improvement ratios are consistently reported with baseline context throughout (e.g., '3.44× latency', '1.5× in latency vs. 1.01× for the compiler' in the abstract, Table 8). These provide clear effect size magnitudes." 
63 }, 64 "sample_size_justified": { 65 "applies": true, 66 "answer": false, 67 "justification": "Only 5 of 22 DaCapo applications were selected (§6.2.2) based on practical constraints (Maven build, test suite availability, file size limits), but no justification for this sample size being sufficient for general claims is provided. No power analysis." 68 }, 69 "variance_reported": { 70 "applies": true, 71 "answer": false, 72 "justification": "§6.4.1 states '2 warm-up runs followed by 5 measured runs for each program, reporting the average,' but no standard deviation, IQR, or any spread measure across those 5 runs is reported anywhere." 73 } 74 }, 75 "evaluation_design": { 76 "baselines_included": { 77 "applies": true, 78 "answer": true, 79 "justification": "Multiple baselines are included: PerfCodeGen (SOTA LLM-based optimizer), gcc -O3 compiler optimization, OpenTuner (for HumanEval_CPP), and JIT compiler flags (for Java). See §6.4.2 and Table 8." 80 }, 81 "baselines_contemporary": { 82 "applies": true, 83 "answer": true, 84 "justification": "PerfCodeGen (2024) is described as SOTA at time of experiments (§6.4.2). Compiler baselines (gcc, JIT) are standard and actively maintained. The paper justifies excluding other large-scale LLM baselines due to incompatibility (§6.4.2)." 85 }, 86 "ablation_study": { 87 "applies": true, 88 "answer": true, 89 "justification": "§6.5 and Table 11 present a thorough component-wise ablation (Base → +Evaluator → +Context → +Advisor) across all three benchmarks, plus system ablation varying feedback iterations (Figure 12) and hotspot budget K (Figure 14)." 90 }, 91 "multiple_metrics": { 92 "applies": true, 93 "answer": true, 94 "justification": "Five performance metrics (latency, memory, CPU cycles, throughput, energy) plus correctness, maintainability metrics (CCN, function count, NLOC, tokens), and comparative metrics (Relative Improvement, %Opt). See Table 5." 
95 }, 96 "human_evaluation": { 97 "applies": true, 98 "answer": true, 99 "justification": "§7.1.4 describes manual inspection of all optimized outputs: 'To verify that performance gains were legitimate, we manually reviewed all outputs that passed automated tests.' §7.1.3 reports a manual qualitative analysis of 30 Advisor outputs with 100% hypothesis-recommendation alignment." 100 }, 101 "held_out_test_set": { 102 "applies": true, 103 "answer": true, 104 "justification": "The three benchmarks (HumanEval_CPP, SciMark2, DaCapo) serve as independent evaluation suites with their own test cases. No tuning was performed on the test sets; model hyperparameters (temperature 0.7) were fixed following prior work." 105 }, 106 "per_category_breakdown": { 107 "applies": true, 108 "answer": true, 109 "justification": "Table 8 provides per-application DaCapo results. Tables 8-9 break down results per benchmark. Figure 9 shows per-optimization-pattern distribution. Table 11 provides per-component ablation. Per-metric breakdowns throughout." 110 }, 111 "failure_cases_discussed": { 112 "applies": true, 113 "answer": true, 114 "justification": "§7.1.4 discusses correctness failures: 1 HumanEval program had its core function removed but passed tests, 2 PMD files mismatched ground truth. §7.2.1 notes Fop and ZXing showed minimal improvement. §8 discusses scaling challenges and correctness limitations." 115 }, 116 "negative_results_reported": { 117 "applies": true, 118 "answer": true, 119 "justification": "CPU regression in BioJava (0.48×, Table 8). Compiler outperforming SysLLMatic on Fop and ZXing. Open-source models degrading performance on DaCapo (DeepSeek-r1 at 0.90× latency, Table 9). Advisor causing regressions on microbenchmarks in ablation (Table 11). Llama4 at 0% SciMark2 correctness." 
120 } 121 }, 122 "claims_and_evidence": { 123 "abstract_claims_supported": { 124 "applies": true, 125 "answer": true, 126 "justification": "Abstract claims of '1.5× in latency' and '1.76× in throughput' on DaCapo are supported by Table 8 (averaging BioJava 3.44×, Fop 1.02×, PMD 1.11×, Graphchi 0.99×, ZXing 0.95× for latency). The claim 'consistently outperforms state-of-the-art LLM baselines on microbenchmarks' is supported by Tables 8-9." 127 }, 128 "causal_claims_justified": { 129 "applies": true, 130 "answer": true, 131 "justification": "The paper makes causal claims about component contributions (e.g., 'incorporating the optimization pattern catalog leads to improvements'). These are supported by controlled ablation studies (Table 11) with single-variable manipulation. The causal mechanism for BioJava's improvement (parallelization) is demonstrated with before/after code (Figure 8)." 132 }, 133 "generalization_bounded": { 134 "applies": true, 135 "answer": false, 136 "justification": "The title 'Large Language Models are Software System Optimizers' is a broad claim. The abstract states the work 'provides a foundation for generating optimized code across various languages, benchmarks, and program sizes.' However, evaluation covers only C++ and Java, 3 benchmarks, and 5 DaCapo apps. While §9 acknowledges generalization limitations, the title and framing significantly overshoot the tested scope." 137 }, 138 "alternative_explanations_discussed": { 139 "applies": true, 140 "answer": true, 141 "justification": "§9 discusses specific alternative explanations: temperature-induced nondeterminism, hardware-level measurement variance. §8.1.5 discusses whether CPU-based profiling captures the right bottlenecks. §8.2.1 discusses test coverage limitations as alternative explanations for apparent correctness." 
142 }, 143 "proxy_outcome_distinction": { 144 "applies": true, 145 "answer": true, 146 "justification": "The paper directly measures the performance metrics it claims (latency, throughput, energy, etc.) through runtime execution. §6.3 precisely defines each metric and its measurement implementation. The gap between measurement and claim is small — they measure performance and claim performance improvement." 147 } 148 }, 149 "setup_transparency": { 150 "model_versions_specified": { 151 "applies": true, 152 "answer": false, 153 "justification": "§6.6 specifies 'GPT-4o' and 'GPT-4.1' without snapshot dates or API versions. Open-source models are listed as 'qwen3-coder:480b, gemma3:27b, deepseek-r1:70b, and llama4:latest.' These are marketing names without specific version identifiers. Model behavior can change across API updates." 154 }, 155 "prompts_provided": { 156 "applies": true, 157 "answer": false, 158 "justification": "§5.6.1 provides 'shortened version of prompts' for Advisor, Generator, and Evaluator, explicitly stating 'These excerpts preserve the key instructions and style of the full prompts while omitting implementation-specific details.' The excerpts contain placeholders like '[Input items...]' and '[Output requirements...]'. Full prompts are not provided in the paper." 159 }, 160 "hyperparameters_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "§6.4.1: temperature 0.7, hotspot budget K=50, iteration budget T=2, pass@1, 2 warm-up + 5 measured runs. JIT compiler flags detailed in §6.4.2. Open-source models run via ollama (§6.6)." 164 }, 165 "scaffolding_described": { 166 "applies": true, 167 "answer": true, 168 "justification": "The Advisor→Generator→Evaluator pipeline is described in detail (§5, Figure 4, Algorithm 1). Profiling-based hotspot identification (§5.2), AST extraction, flame graph analysis, iterative feedback loops, and correctness validation are all documented. 
The workflow diagram and algorithm pseudocode make the scaffold reproducible." 169 }, 170 "data_preprocessing_documented": { 171 "applies": true, 172 "answer": true, 173 "justification": "§6.2.2 documents benchmark filtering: 22 DaCapo apps → 20 buildable → 8 with source+tests → 5 with Maven. Files over 1,000 lines skipped (§6.2.2). Classes without dedicated tests excluded (§5.6.3). HumanEval augmented with translated stress tests from COFFE benchmark (§6.2.1)." 174 } 175 }, 176 "limitations_and_scope": { 177 "limitations_section_present": { 178 "applies": true, 179 "answer": true, 180 "justification": "§8 Discussion dedicates multiple subsections to limitations (§8.1.1 progress and remaining challenges, §8.2 costs of applying SysLLMatic). §9 provides a dedicated Threats to Validity section with construct, internal, and external threats." 181 }, 182 "threats_to_validity_specific": { 183 "applies": true, 184 "answer": true, 185 "justification": "§9 discusses threats specific to this study: temperature 0.7 causing nondeterminism across runs, 5-run averaging for hardware variance mitigation, limited language coverage (C++ and Java only), DaCapo applications being pre-built, and incomplete training data for other languages. §8.2.1 discusses specific test coverage variations (GraphChi ~10% vs ZXing ~90%)." 186 }, 187 "scope_boundaries_stated": { 188 "applies": true, 189 "answer": true, 190 "justification": "§9 External Threats: 'there are several aspects of generalizability that are not assessed, most notably to other programming languages, application types, and hardware platforms.' §8.1.1 explicitly states limitations: assumes access to test suites, DaCapo apps pre-built, profiling data time-consuming to collect." 
191 } 192 }, 193 "data_integrity": { 194 "raw_data_available": { 195 "applies": true, 196 "answer": false, 197 "justification": "No raw performance measurement data (individual run timings, profiling outputs, LLM outputs) is made available for independent verification. The paper reports only averaged/aggregated results." 198 }, 199 "data_collection_described": { 200 "applies": true, 201 "answer": true, 202 "justification": "§6.7 describes the execution environment (Intel Xeon W-2295, 36 CPUs, 188 GB RAM). §6.3 defines each metric's measurement implementation (Linux /usr/bin/time for memory, perf stat for CPU cycles, Intel RAPL for energy). §6.4.1 specifies the measurement protocol (2 warm-up + 5 measured runs)." 203 }, 204 "recruitment_methods_described": { 205 "applies": false, 206 "answer": false, 207 "justification": "No human participants. Data sources are standard public benchmarks (HumanEval-X, SciMark2, DaCapo)." 208 }, 209 "data_pipeline_documented": { 210 "applies": true, 211 "answer": true, 212 "justification": "§6.2.2 documents the DaCapo filtering pipeline with counts at each stage (22 → 20 → 8 → 5). §5 describes the full optimization pipeline from hotspot identification through code generation, validation, and profiling. The file size filtering criterion (>1,000 lines) and its impact (11% of Fop files excluded) are documented." 213 } 214 }, 215 "conflicts_of_interest": { 216 "funding_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Acknowledgments section: 'Davis acknowledges support from NSF awards #2343596. Thiruvathukal and Läufer acknowledge support from NSF award #2343595.'" 220 }, 221 "affiliations_disclosed": { 222 "applies": true, 223 "answer": true, 224 "justification": "All author affiliations are listed: Purdue University (Peng, Gupte, Eliopoulos, Ho, Mantri, Deng, Davis) and Loyola University Chicago (Hasler, Läufer, Thiruvathukal). No evaluated product is affiliated with these institutions." 
225 }, 226 "funder_independent_of_outcome": { 227 "applies": true, 228 "answer": true, 229 "justification": "NSF is a government funding agency with no financial stake in whether LLM-based code optimization outperforms compiler baselines." 230 }, 231 "financial_interests_declared": { 232 "applies": true, 233 "answer": false, 234 "justification": "No competing interests statement or financial disclosure section is present in the paper." 235 } 236 }, 237 "contamination": { 238 "training_cutoff_stated": { 239 "applies": true, 240 "answer": false, 241 "justification": "The paper uses GPT-4o, GPT-4.1, and several open-source models but never states training data cutoff dates for any of them. This is relevant because HumanEval (2021) and SciMark2 (2000) predate all models used." 242 }, 243 "train_test_overlap_discussed": { 244 "applies": true, 245 "answer": false, 246 "justification": "HumanEval was published in 2021 and SciMark2 in 2000 — solutions could be in training data for all tested models. No discussion of potential train/test overlap anywhere in the paper." 247 }, 248 "benchmark_contamination_addressed": { 249 "applies": true, 250 "answer": false, 251 "justification": "HumanEval and SciMark2 were available online well before any tested model's training cutoff. The paper uses these benchmarks without any discussion of contamination risk. This is particularly relevant since the LLM must understand the code's semantics to optimize it." 252 } 253 }, 254 "human_studies": { 255 "pre_registered": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study. It evaluates LLM-based code optimization on software benchmarks." 259 }, 260 "irb_or_ethics_approval": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "demographics_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 
269 }, 270 "inclusion_exclusion_criteria": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "randomization_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "blinding_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 }, 285 "attrition_reported": { 286 "applies": false, 287 "answer": false, 288 "justification": "No human participants in this study." 289 } 290 }, 291 "cost_and_practicality": { 292 "inference_cost_reported": { 293 "applies": true, 294 "answer": true, 295 "justification": "Table 12 reports end-to-end optimization time, number of LLM queries, and energy consumption (low/high bounds in Wh) per DaCapo application. §7.5.2 provides break-even analysis modeling inference costs against runtime and energy savings." 296 }, 297 "compute_budget_stated": { 298 "applies": true, 299 "answer": true, 300 "justification": "§6.7 specifies hardware (Intel Xeon W-2295 bare-metal server, NVIDIA H100 GPU cluster). Table 12 reports time and energy per application (e.g., BioJava: 47 min, 54 queries, 13.0-324 Wh). The break-even analysis in §7.5.2 models total resource consumption." 301 } 302 }, 303 "experimental_rigor": { 304 "seed_sensitivity_reported": { 305 "applies": true, 306 "answer": false, 307 "justification": "Temperature is set to 0.7 (§6.4.1) introducing nondeterminism, and results use pass@1. No seed sensitivity analysis or results across multiple random seeds are reported. §9 acknowledges that 'identical prompts may yield different outputs across runs' but does not quantify this." 
308 }, 309 "number_of_runs_stated": { 310 "applies": true, 311 "answer": true, 312 "justification": "§6.4.1: 'we perform 2 warm-up runs followed by 5 measured runs for each program, reporting the average of the measured runs.'" 313 }, 314 "hyperparameter_search_budget": { 315 "applies": true, 316 "answer": false, 317 "justification": "Temperature is fixed at 0.7 'following prior work' (§6.4.1), K=50 is chosen based on covering 'all or most meaningful hotspots' (§5.6.2). While K is ablated across 3 values (50, 100, 150) in §7.4.2, no systematic hyperparameter search budget is reported." 318 }, 319 "best_config_selection_justified": { 320 "applies": true, 321 "answer": true, 322 "justification": "Configuration choices are justified: temperature 0.7 from prior work convention (§6.4.1), K=50 from ablation showing diminishing returns (Figure 14), T=2 iterations from system ablation (Figure 12), class-level granularity from correctness comparison (§7.4.3, Figure 13)." 323 }, 324 "multiple_comparison_correction": { 325 "applies": false, 326 "answer": false, 327 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 328 }, 329 "self_comparison_bias_addressed": { 330 "applies": true, 331 "answer": false, 332 "justification": "§6.4.2 describes re-implementing PerfCodeGen for different languages: 'We reimplemented PerfCodeGen to support our benchmarks.' No acknowledgment of the Lucic et al. (2018) concern that authors' re-implementations of baselines systematically underperform." 333 }, 334 "compute_budget_vs_performance": { 335 "applies": true, 336 "answer": true, 337 "justification": "Table 12 reports optimization cost (time, queries, energy) per application. §7.5 provides break-even analysis. The cost of LLM optimization is explicitly compared to gains. Figure 15 plots time-to-break-even as a function of execution frequency." 
338 }, 339 "benchmark_construct_validity": { 340 "applies": true, 341 "answer": true, 342 "justification": "§8.1.5 discusses whether CPU-based profiling captures the right performance dimensions, comparing hotspot profiles across event types (Figure 16). §6.2 justifies benchmark selection for different evaluation goals (synthetic vs. real-world). §8.1.3 discusses function-level vs. class-level validity." 343 }, 344 "scaffold_confound_addressed": { 345 "applies": true, 346 "answer": true, 347 "justification": "Model comparisons in Table 9 use the same SysLLMatic framework across all models (GPT-4o, GPT-4.1, Qwen, Gemma, DeepSeek, Llama4). The PerfCodeGen comparison is explicitly a system-level comparison. §6.4.2 ensures 'same input code, configuration, environment, and evaluation metrics.'" 348 } 349 }, 350 "data_leakage": { 351 "temporal_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "HumanEval was published in 2021, SciMark2 in 2000 — solutions and benchmarks predate all tested models' training data. No discussion of whether models may have seen solutions during training." 355 }, 356 "feature_leakage_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of whether evaluation setup leaks answer information. The LLM receives source code and profiling data; it's not discussed whether optimization patterns for these specific benchmarks may have appeared in training data." 360 }, 361 "non_independence_addressed": { 362 "applies": true, 363 "answer": false, 364 "justification": "No discussion of whether benchmark programs share structural similarities with training data or with each other in ways that could inflate results." 365 }, 366 "leakage_detection_method": { 367 "applies": true, 368 "answer": false, 369 "justification": "No leakage detection or prevention methods are used. No canary strings, membership inference tests, or decontamination pipelines." 
370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "SysLLMatic achieves average 1.5× latency and 1.76× throughput improvement on DaCapoBench, surpassing compiler optimizations (1.01× and 1.02×).", 376 "evidence": "Table 8 shows per-application results: BioJava 3.44×/4.63×, Fop 1.02×/1.07×, PMD 1.11×/1.12×, Graphchi 0.99×/0.98×, ZXing 0.95×/0.99×. Averages are heavily driven by BioJava's parallelization gains (§7.2.1).", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "SysLLMatic consistently outperforms state-of-the-art LLM baselines (PerfCodeGen) on microbenchmarks.", 381 "evidence": "Table 8: On SciMark2, SysLLMatic achieves 1.55× latency vs PerfCodeGen's 1.00×. On HumanEval, SysLLMatic achieves 88.41% correctness vs PerfCodeGen's 70%, and higher %Opt in latency (23.2% vs 15.0%).", 382 "supported": "strong" 383 }, 384 { 385 "claim": "Compiler baselines remain highly competitive, particularly on HumanEval where they achieve higher %Opt than SysLLMatic in 3 of 4 metrics.", 386 "evidence": "Table 8: Compiler achieves 61.0% CPU cycles %Opt vs SysLLMatic's 29.9% on HumanEval. Compiler outperforms on Fop (1.08× vs 1.02× latency) and ZXing (1.40× vs 0.95× latency) in DaCapo.", 387 "supported": "strong" 388 }, 389 { 390 "claim": "The optimization pattern catalog is essential for large-scale DaCapo applications but not for microbenchmarks.", 391 "evidence": "Table 11 ablation: Adding Advisor improves DaCapo latency from 1.07× to 1.50×, throughput from 1.06× to 1.76×. But on SciMark2, adding the Advisor lowers the latency improvement from 6.25× (+Ctx configuration) to 1.55×.", 392 "supported": "strong" 393 }, 394 { 395 "claim": "Open-source LLMs achieve comparable performance on microbenchmarks but fall short on large-scale applications.", 396 "evidence": "Table 9: Qwen3-Coder:480B achieves 93.9% correctness on HumanEval (vs GPT-4o's 88.4%), but only 1.03× latency on DaCapo (vs GPT-4.1's 1.50×). 
DeepSeek-r1 regresses on DaCapo, reaching only a 0.90× latency improvement ratio (a slowdown).", 397 "supported": "moderate" 398 }, 399 { 400 "claim": "BioJava achieves 3.44× latency and 4.63× throughput improvement through parallelization.", 401 "evidence": "Table 8 and Figure 8 show before/after code with thread pool parallelization. However, CPU cycles regress to 0.48×, reflecting concurrency overhead (§7.2.1).", 402 "supported": "strong" 403 }, 404 { 405 "claim": "SysLLMatic's optimization cost is worthwhile for high-throughput applications, with BioJava breaking even within 5-8 days under conservative usage assumptions.", 406 "evidence": "§7.5.2 models break-even at 230 executions (energy) and 414 executions (latency) for BioJava, translating to 5 days and 8 days at 50 executions/day. PMD requires months to years (Figure 15).", 407 "supported": "moderate" 408 } 409 ], 410 "red_flags": [ 411 { 412 "flag": "No variance or uncertainty quantification", 413 "detail": "Despite performing 5 measured runs per program (§6.4.1), no standard deviation, confidence intervals, or error bars are reported. With temperature 0.7 introducing nondeterminism in LLM outputs, the stability of results is unquantified. The paper acknowledges this threat (§9) but does not address it." 414 }, 415 { 416 "flag": "BioJava dominates DaCapo averages", 417 "detail": "The headline '1.5× latency, 1.76× throughput' on DaCapo is driven almost entirely by BioJava (3.44×, 4.63×). The other 4 applications average ~1.02× latency and ~1.04× throughput. Reporting the average without emphasizing this skew is misleading." 418 }, 419 { 420 "flag": "Benchmark contamination not addressed", 421 "detail": "HumanEval (2021) and SciMark2 (2000) predate all tested models' training. GPT-4o/4.1 and open-source models may have seen optimization solutions for these benchmarks. No discussion of contamination risk despite this being a well-known concern." 
422 }, 423 { 424 "flag": "Baseline re-implementation", 425 "detail": "§6.4.2: PerfCodeGen was re-implemented by the authors for different languages. Lucic et al. (2018) showed that authors' re-implementations of baselines systematically underperform. The original PerfCodeGen authors were not involved in verification." 426 }, 427 { 428 "flag": "Very small DaCapo evaluation set", 429 "detail": "Only 5 of 22 DaCapo applications were used, selected for convenience (Maven build system, available test suites). The filtering is honest but the resulting sample is small and may not represent the diversity of real-world software systems." 430 }, 431 { 432 "flag": "Model versions unspecified", 433 "detail": "GPT-4o and GPT-4.1 are used without snapshot dates or API versions. Model behavior changes across versions; results may not be reproducible with future API updates." 434 } 435 ], 436 "cited_papers": [ 437 { 438 "title": "Language models for code optimization: Survey, challenges and future directions", 439 "authors": ["J. Gong", "V. Voskanyan", "P. Brookes"], 440 "year": 2025, 441 "arxiv_id": "2501.01277", 442 "relevance": "Major survey of LLM-based code optimization cataloging limitations of current approaches, directly motivating SysLLMatic's design." 443 }, 444 { 445 "title": "Evaluating large language models trained on code", 446 "authors": ["M. Chen", "J. Tworek", "H. Jun"], 447 "year": 2021, 448 "arxiv_id": "2107.03374", 449 "relevance": "Introduces HumanEval benchmark used as one of three evaluation suites in this paper." 450 }, 451 { 452 "title": "EffiLearner: Enhancing efficiency of generated code via self-optimization", 453 "authors": ["D. Huang", "J. Dai", "H. Weng"], 454 "year": 2024, 455 "arxiv_id": "2405.15189", 456 "relevance": "Self-optimization pipeline using overhead profiles for code efficiency, representative of feedback-based LLM optimization approaches." 
457 }, 458 { 459 "title": "PerfCodeGen: Improving performance of LLM generated code with execution feedback", 460 "authors": ["Y. Peng", "A. D. Gotmare", "M. Lyu"], 461 "year": 2024, 462 "arxiv_id": "2412.03578", 463 "relevance": "Primary LLM-based baseline, SOTA training-free framework using unit test runtime to guide performance improvements." 464 }, 465 { 466 "title": "DeepDev-PERF: a deep learning-based approach for improving software performance", 467 "authors": ["S. Garg", "R. Z. Moghaddam", "C. B. Clement"], 468 "year": 2022, 469 "relevance": "Fine-tuned BART for real-world C# performance optimization; one of few prior works targeting real-world applications." 470 }, 471 { 472 "title": "Search-Based LLMs for Code Optimization", 473 "authors": ["S. Gao", "C. Gao", "W. Gu"], 474 "year": 2025, 475 "relevance": "Search-based approach to LLM code optimization, part of the growing literature on automated code performance improvement." 476 }, 477 { 478 "title": "MARCO: A multi-agent system for optimizing HPC code generation using large language models", 479 "authors": ["A. Rahman", "V. Cvetkovic", "K. Reece"], 480 "year": 2025, 481 "arxiv_id": "2505.03906", 482 "relevance": "Multi-agent framework for HPC code optimization, representing the agent-based approach to code optimization." 483 }, 484 { 485 "title": "Learning performance-improving code edits", 486 "authors": ["A. G. Shypula", "A. Madaan", "Y. Zeng"], 487 "year": 2024, 488 "relevance": "Foundational work on LLM-based performance-improving code edits, establishes conventions used as baselines." 489 }, 490 { 491 "title": "When Faster Isn't Greener: The Hidden Costs of LLM-Based Code Optimization", 492 "authors": ["T. Coignion", "C. Quinton", "R. Rouvoy"], 493 "year": 2025, 494 "relevance": "Examines sustainability costs of LLM-based optimization, directly motivating SysLLMatic's break-even analysis." 
495 }, 496 { 497 "title": "Large language models for software engineering: A systematic literature review", 498 "authors": ["X. Hou", "Y. Zhao", "Y. Liu"], 499 "year": 2024, 500 "relevance": "Comprehensive survey of LLMs for software engineering tasks providing context for LLM code optimization capabilities." 501 }, 502 { 503 "title": "Large Language Models for energy-efficient code: Emerging results and future directions", 504 "authors": ["H. Peng", "A. Gupte", "N. J. Eliopoulos"], 505 "year": 2024, 506 "arxiv_id": "2410.09241", 507 "relevance": "Prior work by some of the same authors on LLM-guided energy-efficient code optimization through prompting." 508 }, 509 { 510 "title": "RAPGen: An approach for fixing code inefficiencies in zero-shot", 511 "authors": ["S. Garg", "R. Z. Moghaddam", "N. Sundaresan"], 512 "year": 2025, 513 "arxiv_id": "2306.17077", 514 "relevance": "Retrieval-augmented prompting for code performance bug repair in real-world projects, one of few prior works on application-level optimization." 515 } 516 ] 517 }