scan.json (26472B)
1 { 2 "paper": { 3 "title": "Language Models for Code Optimization: Survey, Challenges and Future Directions", 4 "authors": [ 5 "Jingzhi Gong", 6 "Vardan Voskanyan", 7 "Paul Brookes", 8 "Fan Wu", 9 "Wei Jie", 10 "Jie Xu", 11 "Rafail Giavrimis", 12 "Mike Basios", 13 "Leslie Kanthan", 14 "Zheng Wang" 15 ], 16 "year": 2025, 17 "venue": "ACM Computing Surveys (arXiv preprint)", 18 "arxiv_id": "2501.01277", 19 "doi": "10.48550/arXiv.2501.01277" 20 }, 21 "scan_version": 3, 22 "active_modules": ["survey_methodology"], 23 "methodology_tags": ["meta-analysis"], 24 "key_findings": "This SLR of 53 primary studies finds that general-purpose LMs (61 instances) were more widely adopted for code optimization than code-specialized LMs (43 instances), with 57% of studies using off-the-shelf models. 81% of studies targeted a single programming language and 79% a single performance metric, highlighting limited generalizability. Only 32% of studies evaluated on real-world code, with 68% relying on competitive programming or synthetic datasets. Five open challenges are identified including balancing model complexity with practicality, limited interaction with external systems, and trust/reliability in AI-driven optimization.", 25 "checklist": { 26 "artifacts": { 27 "code_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The paper states 'Full list of studies and all the raw results of this survey can be accessed at: https://github.com/gjz78910/CodeOpt-SLR' (Section 1, footnote 1). A repository URL is provided." 31 }, 32 "data_released": { 33 "applies": true, 34 "answer": true, 35 "justification": "The GitHub repository is stated to contain 'all the raw results of this survey,' and the full methodology and related works are also hosted there. The extracted data from the 53 primary studies appears to be released." 
36 }, 37 "environment_specified": { 38 "applies": true, 39 "answer": false, 40 "justification": "No environment specifications, dependency files, or setup instructions are mentioned in the paper for reproducing the survey analysis." 41 }, 42 "reproduction_instructions": { 43 "applies": true, 44 "answer": false, 45 "justification": "No step-by-step reproduction instructions are provided in the paper. The methodology is described at a high level in Section 3, but specific steps to reproduce the analysis are not given." 46 } 47 }, 48 "statistical_methodology": { 49 "confidence_intervals_or_error_bars": { 50 "applies": false, 51 "answer": false, 52 "justification": "This is a survey paper that counts and categorizes studies. It does not perform statistical aggregation or meta-analysis requiring confidence intervals." 53 }, 54 "significance_tests": { 55 "applies": false, 56 "answer": false, 57 "justification": "The survey reports descriptive statistics (counts, percentages) of categorized studies. No comparative statistical claims requiring significance tests are made." 58 }, 59 "effect_sizes_reported": { 60 "applies": false, 61 "answer": false, 62 "justification": "No effect sizes are relevant for a categorization-based survey without statistical aggregation." 63 }, 64 "sample_size_justified": { 65 "applies": true, 66 "answer": false, 67 "justification": "The paper identifies 53 primary studies from 2,346 searched studies but does not justify whether 53 is sufficient for the breadth of conclusions drawn, nor discuss whether the search may have missed relevant studies beyond acknowledging the quasi-gold standard approach." 68 }, 69 "variance_reported": { 70 "applies": false, 71 "answer": false, 72 "justification": "No experimental runs or statistical aggregation requiring variance reporting." 
73 } 74 }, 75 "evaluation_design": { 76 "baselines_included": { 77 "applies": true, 78 "answer": false, 79 "justification": "The paper mentions that existing reviews focus on general SE applications or program repair (Section 1) but does not formally compare its scope, methodology, or findings against those prior surveys." 80 }, 81 "baselines_contemporary": { 82 "applies": true, 83 "answer": false, 84 "justification": "Prior surveys are referenced but not systematically compared. The paper claims to fill a gap but does not benchmark its coverage or findings against specific prior reviews." 85 }, 86 "ablation_study": { 87 "applies": false, 88 "answer": false, 89 "justification": "No system with components to ablate; this is a survey paper." 90 }, 91 "multiple_metrics": { 92 "applies": false, 93 "answer": false, 94 "justification": "No experiments are conducted; this is a survey paper." 95 }, 96 "human_evaluation": { 97 "applies": false, 98 "answer": false, 99 "justification": "No experimental outputs to evaluate; this is a survey paper." 100 }, 101 "held_out_test_set": { 102 "applies": false, 103 "answer": false, 104 "justification": "No experiments with train/test splits; this is a survey paper." 105 }, 106 "per_category_breakdown": { 107 "applies": true, 108 "answer": true, 109 "justification": "The survey provides detailed breakdowns across multiple dimensions: LM types (Table 1), model sizes (Figure 6), training approaches (Figure 7), challenges (Table 2), optimization techniques (Table 3), LM roles (Table 4), programming languages (Table 5), performance metrics (Table 6), datasets (Table 7), and evaluation approaches (Figures 10-11)." 110 }, 111 "failure_cases_discussed": { 112 "applies": true, 113 "answer": true, 114 "justification": "Section 8 discusses five open challenges including limited generalizability, lack of real-world evaluation, hallucination issues, and trust/reliability concerns. 
The paper also notes limitations of each technique category throughout Sections 4-7." 115 }, 116 "negative_results_reported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper reports several negative findings: 68% of studies were not evaluated on real-world code (Section 7.2), 81% focused on single languages (Section 6.1), and LMs 'currently struggle with larger programs and often yield marginal improvements over traditional compilers' (Section 2.3, citing reference [113])." 120 } 121 }, 122 "claims_and_evidence": { 123 "abstract_claims_supported": { 124 "applies": true, 125 "answer": true, 126 "justification": "The abstract claims about general-purpose LMs (61 instances), 57% pre-trained, 43% fine-tuned, and the specific challenge counts are all supported by the corresponding tables and figures in the body (Tables 1-3, Figure 7)." 127 }, 128 "causal_claims_justified": { 129 "applies": true, 130 "answer": false, 131 "justification": "The paper makes causal claims such as 'General-purpose LMs like GPT-4 were more widely adopted... due to their broader understanding and reasoning capabilities' (Section 4.1) without evidence for the causal mechanism. The 'due to' attributions for adoption patterns are unsubstantiated interpretations rather than findings from the survey data." 132 }, 133 "generalization_bounded": { 134 "applies": true, 135 "answer": true, 136 "justification": "The scope is clearly bounded to 'LM-based approaches for code optimization' (Section 1), and Figure 1 explicitly shows the survey scope. The title accurately reflects the content, and conclusions are generally tied to the surveyed literature." 137 }, 138 "alternative_explanations_discussed": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper presents observed patterns (e.g., dominance of general-purpose LMs, preference for single-language studies) with single explanations and does not consider alternative reasons. 
For instance, the popularity of GPT-4 could be due to API accessibility rather than 'broader understanding,' but alternatives are not discussed." 142 }, 143 "proxy_outcome_distinction": { 144 "applies": true, 145 "answer": true, 146 "justification": "The paper's claims are at the level of its measurements — counting and categorizing studies. It does not frame study counts as proxies for broader phenomena. When it draws conclusions about the field, these are stated as observations from the surveyed literature." 147 } 148 }, 149 "setup_transparency": { 150 "model_versions_specified": { 151 "applies": false, 152 "answer": false, 153 "justification": "No AI models were used in conducting this survey." 154 }, 155 "prompts_provided": { 156 "applies": false, 157 "answer": false, 158 "justification": "No prompting was used in this survey." 159 }, 160 "hyperparameters_reported": { 161 "applies": false, 162 "answer": false, 163 "justification": "No models or hyperparameters involved in conducting this survey." 164 }, 165 "scaffolding_described": { 166 "applies": false, 167 "answer": false, 168 "justification": "No agentic scaffolding used in this survey." 169 }, 170 "data_preprocessing_documented": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 3 describes the three-stage methodology: (1) search using six academic indexing engines with a defined search string and snowballing, (2) study selection using inclusion/exclusion criteria and quality assessment, (3) data collection guided by 4 RQs with 11 sub-questions. Figure 4 shows the pipeline with counts (2,346 → 53). However, full criteria details are deferred to the GitHub repo." 174 } 175 }, 176 "limitations_and_scope": { 177 "limitations_section_present": { 178 "applies": true, 179 "answer": false, 180 "justification": "There is no dedicated limitations section for the survey methodology itself. 
Section 8 discusses challenges in the field (limitations of LM-based code optimization), not limitations of this survey. The conclusion briefly notes 'it is impossible to provide a definitive cataloger of all research' but this is a single sentence, not substantive discussion." 181 }, 182 "threats_to_validity_specific": { 183 "applies": true, 184 "answer": false, 185 "justification": "No threats to the validity of the survey methodology are discussed. There is no mention of potential biases in the search process, coding reliability, or limitations of the categorization approach." 186 }, 187 "scope_boundaries_stated": { 188 "applies": true, 189 "answer": true, 190 "justification": "The paper explicitly distinguishes code optimization from code generation, refactoring, and repair (Section 2.1). Figure 1 visually delineates the survey scope. The definition of code optimization is clearly stated as 'techniques that enhance performance objectives while preserving the original functionality.'" 191 } 192 }, 193 "data_integrity": { 194 "raw_data_available": { 195 "applies": true, 196 "answer": true, 197 "justification": "The GitHub repository (https://github.com/gjz78910/CodeOpt-SLR) is stated to contain 'all the raw results of this survey,' enabling independent verification of the categorizations." 198 }, 199 "data_collection_described": { 200 "applies": true, 201 "answer": true, 202 "justification": "Section 3 describes the data collection procedure: comprehensive automatic searches through six academic indexing engines using a defined search string following the quasi-gold standard methodology, supplemented by snowballing searches. The three-stage process is outlined in Figure 4." 203 }, 204 "recruitment_methods_described": { 205 "applies": false, 206 "answer": false, 207 "justification": "No human participants. The paper selection process is described under data_collection_described and data_preprocessing_documented." 
208 }, 209 "data_pipeline_documented": { 210 "applies": true, 211 "answer": true, 212 "justification": "Figure 4 documents the pipeline: manual search (10 studies for quasi-gold standard) + automatic search (2,310 studies) + snowballing → 2,346 total → inclusion/exclusion criteria → quality assessment → 53 primary studies → data collection and analysis. Counts at each stage are provided." 213 } 214 }, 215 "conflicts_of_interest": { 216 "funding_disclosed": { 217 "applies": true, 218 "answer": false, 219 "justification": "No funding sources, grants, or acknowledgments section is present in the paper." 220 }, 221 "affiliations_disclosed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Author affiliations are clearly listed. Multiple authors (Voskanyan, Brookes, Wu, Giavrimis, Basios, Kanthan) are affiliated with TurinTech AI, a company that works on AI-based code optimization. University affiliations are also listed." 225 }, 226 "funder_independent_of_outcome": { 227 "applies": true, 228 "answer": false, 229 "justification": "No funding is disclosed. With six authors from TurinTech AI (a company commercializing AI for code optimization), the lack of any funding or conflict disclosure is a concern." 230 }, 231 "financial_interests_declared": { 232 "applies": true, 233 "answer": false, 234 "justification": "No competing interests or financial interests statement is present. Six authors are from TurinTech AI, which has a commercial interest in the field being surveyed, but this potential conflict is not acknowledged." 235 } 236 }, 237 "contamination": { 238 "training_cutoff_stated": { 239 "applies": false, 240 "answer": false, 241 "justification": "This is a survey paper that does not evaluate any pre-trained model on benchmarks." 242 }, 243 "train_test_overlap_discussed": { 244 "applies": false, 245 "answer": false, 246 "justification": "This is a survey paper that does not evaluate any pre-trained model on benchmarks." 
247 }, 248 "benchmark_contamination_addressed": { 249 "applies": false, 250 "answer": false, 251 "justification": "This is a survey paper that does not evaluate any pre-trained model on benchmarks." 252 } 253 }, 254 "human_studies": { 255 "pre_registered": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this survey." 259 }, 260 "irb_or_ethics_approval": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this survey." 264 }, 265 "demographics_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this survey." 269 }, 270 "inclusion_exclusion_criteria": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this survey." 274 }, 275 "randomization_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this survey." 279 }, 280 "blinding_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this survey." 284 }, 285 "attrition_reported": { 286 "applies": false, 287 "answer": false, 288 "justification": "No human participants in this survey." 289 } 290 }, 291 "cost_and_practicality": { 292 "inference_cost_reported": { 293 "applies": false, 294 "answer": false, 295 "justification": "This is a survey paper with no computational method of its own." 296 }, 297 "compute_budget_stated": { 298 "applies": false, 299 "answer": false, 300 "justification": "This is a survey paper with no computational method of its own." 301 } 302 }, 303 "survey_methodology": { 304 "prisma_or_structured_protocol": { 305 "applies": true, 306 "answer": true, 307 "justification": "The paper follows the Kitchenham and Charters (2007) guidelines for SLRs in Software Engineering (Section 3). 
It uses a quasi-gold standard methodology for search string construction, applies structured inclusion/exclusion criteria, and documents the pipeline in Figure 4 with a flow diagram showing counts at each stage." 308 }, 309 "quality_assessment_of_sources": { 310 "applies": true, 311 "answer": false, 312 "justification": "Quality assessment is mentioned as part of the selection pipeline (Figure 4) for inclusion/exclusion purposes, but the survey does not assess or report the methodological quality of the 53 included studies. All studies are treated equally in the analysis regardless of their rigor. No quality scores, risk-of-bias ratings, or strength-of-evidence assessments are applied." 313 }, 314 "publication_bias_discussed": { 315 "applies": true, 316 "answer": false, 317 "justification": "No discussion of publication bias. The survey does not consider whether the 53 included studies skew toward positive results, does not use funnel plots or tests for publication bias, and does not acknowledge that published LM-for-code-optimization papers may disproportionately report successes." 
318 } 319 } 320 }, 321 "claims": [ 322 { 323 "claim": "General-purpose LMs were more widely adopted (61 instances) than code-specialized LMs (43 instances) for code optimization, with GPT-4 being the most frequently used (15 studies).", 324 "evidence": "Table 1 in Section 4.1 provides a detailed breakdown of all LMs used across the 53 primary studies, listing each model's usage count with study references.", 325 "supported": "strong" 326 }, 327 { 328 "claim": "57% of studies leveraged off-the-shelf pre-trained LMs while 43% fine-tuned models for specific tasks.", 329 "evidence": "Figure 7 in Section 4.3 shows the distribution, with the underlying study references provided for each category.", 330 "supported": "strong" 331 }, 332 { 333 "claim": "The most commonly highlighted challenges were limitation of one-step optimization (18 studies), balancing correctness and efficiency (15 studies), and complexity of code syntax (10 studies).", 334 "evidence": "Table 2 in Section 5.1 provides the full categorization of challenges with specific study references for each.", 335 "supported": "strong" 336 }, 337 { 338 "claim": "81% of studies focused on a single programming language and 79% targeted a single performance metric.", 339 "evidence": "Figures 8 and 9 in Section 6 show the distributions, with 43/53 studies on one language and 42/53 on one metric.", 340 "supported": "strong" 341 }, 342 { 343 "claim": "68% of studies (36 of 53) were not evaluated on real-world code projects, relying instead on competitive programming or synthetic datasets.", 344 "evidence": "Figure 10 and Section 7.2 break down evaluation into no real-world (36), code snippets (12), and full projects (5).", 345 "supported": "strong" 346 }, 347 { 348 "claim": "Model-based techniques were the most popular approach (51 instances), followed by prompt engineering (34 instances) and problem formulation (33 instances).", 349 "evidence": "Table 3 in Section 5.2 provides the full categorization with study 
references for each technique and sub-category.", 350 "supported": "strong" 351 } 352 ], 353 "red_flags": [ 354 { 355 "flag": "Undisclosed conflict of interest", 356 "detail": "Six of ten authors are affiliated with TurinTech AI, a company that commercializes AI-based code optimization tools (their product Artemis++ is cited as reference [43]). No conflict of interest statement, competing interests declaration, or funding disclosure is present. A survey of the field by employees of a company operating in that field warrants explicit disclosure." 357 }, 358 { 359 "flag": "No quality assessment of surveyed studies", 360 "detail": "Although quality assessment is used as a selection filter (Figure 4), the survey treats all 53 included studies equally in its analysis without reporting their methodological quality, risk of bias, or strength of evidence. This risks laundering weak or poorly designed studies alongside rigorous ones, making the aggregate counts and percentages potentially misleading." 361 }, 362 { 363 "flag": "Publication bias not addressed", 364 "detail": "The survey does not discuss whether the included studies skew toward positive results. Published LM-for-code-optimization papers likely overrepresent successes, but this systematic bias is not acknowledged or mitigated." 365 }, 366 { 367 "flag": "Key methodology details deferred to external repository", 368 "detail": "The full search methodology, related works, and inclusion/exclusion criteria details are not in the paper but deferred to the GitHub repo ('Due to space limitation, the full methodology can be accessed in our repository'). This makes independent assessment of the survey's rigor dependent on external resources." 369 }, 370 { 371 "flag": "No limitations section for the survey itself", 372 "detail": "While Section 8 discusses challenges in the field, there is no discussion of limitations of the survey methodology — no threats to validity, no inter-coder reliability assessment, no discussion of potential gaps in the search strategy." 
373 } 374 ], 375 "cited_papers": [ 376 { 377 "title": "Self-Refine: Iterative Refinement with Self-Feedback", 378 "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"], 379 "year": 2023, 380 "relevance": "Key LLM self-improvement technique applied to code optimization, demonstrating iterative feedback mechanisms for code reasoning." 381 }, 382 { 383 "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", 384 "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath", "Karthik Narasimhan", "Shunyu Yao"], 385 "year": 2023, 386 "relevance": "Agentic LLM framework using verbal reinforcement learning for code optimization and decision-making tasks." 387 }, 388 { 389 "title": "Code Llama: Open Foundation Models for Code", 390 "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"], 391 "year": 2023, 392 "arxiv_id": "2308.12950", 393 "relevance": "Most widely used code-specialized LM in the surveyed studies (11 instances), foundational model for code optimization research." 394 }, 395 { 396 "title": "Large Language Models for Compiler Optimization", 397 "authors": ["Chris Cummins", "Volker Seeker", "Dejan Grubisic"], 398 "year": 2023, 399 "arxiv_id": "2309.07062", 400 "relevance": "Pioneering work on LLMs as compiler emulators for code optimization at the IR level." 401 }, 402 { 403 "title": "Large Language Model-Based Agents for Software Engineering: A Survey", 404 "authors": ["Junwei Liu", "Kaixin Wang", "Yixuan Chen"], 405 "year": 2024, 406 "arxiv_id": "2409.02977", 407 "relevance": "Comprehensive survey of LLM-based agents for software engineering tasks including code optimization." 408 }, 409 { 410 "title": "Large Language Models for Software Engineering: A Systematic Literature Review", 411 "authors": ["Xinyi Hou", "Yanjie Zhao", "Yue Liu"], 412 "year": 2023, 413 "relevance": "Prior SLR on LLMs in software engineering, positioned as complementary to this code-optimization-specific survey." 
414 }, 415 { 416 "title": "A Systematic Literature Review on Large Language Models for Automated Program Repair", 417 "authors": ["Quanjun Zhang", "Chunrong Fang", "Yang Xie"], 418 "year": 2024, 419 "arxiv_id": "2405.01466", 420 "relevance": "Related SLR focusing on LLMs for automated program repair, adjacent to code optimization." 421 }, 422 { 423 "title": "Evaluating Large Language Models Trained on Code", 424 "authors": ["Mark Chen", "Jerry Tworek"], 425 "year": 2021, 426 "arxiv_id": "2107.03374", 427 "relevance": "Introduces HumanEval and Codex, foundational benchmarks and models used across the surveyed code optimization studies." 428 }, 429 { 430 "title": "Learning Performance-Improving Code Edits", 431 "authors": ["Alexander Shypula", "Aman Madaan", "Yimeng Zeng"], 432 "year": 2024, 433 "relevance": "Introduces the PIE dataset with 77K slow-fast code pairs, a foundational dataset for LM-based code performance optimization." 434 }, 435 { 436 "title": "Mathematical Discoveries from Program Search with Large Language Models", 437 "authors": ["Bernardino Romera-Paredes", "Mohammadamin Barekatain", "Alexander Novikov"], 438 "year": 2024, 439 "relevance": "FunSearch: demonstrates LLMs searching program space for optimization, achieving mathematical discoveries through code." 440 }, 441 { 442 "title": "Search-Based LLMs for Code Optimization", 443 "authors": ["Shuzheng Gao", "Cuiyun Gao", "Wenchao Gu", "Michael Lyu"], 444 "year": 2024, 445 "relevance": "SBLLM framework combining evolutionary search with LLMs for code optimization, addressing the search-based approach category." 446 }, 447 { 448 "title": "DeepDev-PERF: A Deep Learning-Based Approach for Improving Software Performance", 449 "authors": ["Spandan Garg", "Roshanak Zilouchian Moghaddam", "Colin B. Clement"], 450 "year": 2022, 451 "relevance": "Deep learning approach for real-world C# performance optimization using GitHub data, one of few studies evaluating on full projects." 
452 } 453 ], 454 "engagement_factors": { 455 "practical_relevance": { 456 "score": 1, 457 "justification": "As a survey, it provides useful reference tables and taxonomy for researchers but is not directly actionable for practitioners." 458 }, 459 "surprise_contrarian": { 460 "score": 1, 461 "justification": "Finding that 68% of studies lack real-world evaluation and 81% target single languages is mildly surprising but not deeply contrarian." 462 }, 463 "fear_safety": { 464 "score": 0, 465 "justification": "No AI safety or security concerns are raised; the paper focuses on code performance optimization." 466 }, 467 "drama_conflict": { 468 "score": 0, 469 "justification": "No controversy or dramatic claims; straightforward survey with moderate conclusions." 470 }, 471 "demo_ability": { 472 "score": 0, 473 "justification": "No tool, demo, or runnable artifact; the GitHub repo contains survey data only." 474 }, 475 "brand_recognition": { 476 "score": 1, 477 "justification": "Discusses well-known models (GPT-4, Copilot, Code Llama) but the authoring institutions are not major AI labs." 478 } 479 } 480 }