scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (18163B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Language Models for Code Optimization: Survey, Challenges and Future Directions",
      6     "authors": [
      7       "Jingzhi Gong",
      8       "Vardan Voskanyan",
      9       "Paul Brookes",
     10       "Fan Wu",
     11       "Wei Jie",
     12       "Jie Xu",
     13       "Rafail Giavrimis",
     14       "Mike Basios",
     15       "Leslie Kanthan",
     16       "Zheng Wang"
     17     ],
     18     "year": 2025,
     19     "venue": "arXiv.org",
     20     "arxiv_id": "2501.01277",
     21     "doi": "10.48550/arXiv.2501.01277"
     22   },
     23   "checklist": {
     24     "claims_and_evidence": {
     25       "abstract_claims_supported": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The abstract claims 53 primary studies, 11 sub-questions, 5 open challenges, and 8 future directions — all confirmed explicitly in the body (Sections 3, 8).",
     29         "source": "haiku"
     30       },
     31       "causal_claims_justified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper attributes adoption patterns causally (e.g., GPT-4 preferred 'due to their broader understanding and reasoning capabilities') but this is editorial inference rather than tested — the survey only counts usage frequency, not capability comparisons.",
     35         "source": "haiku"
     36       },
     37       "generalization_bounded": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Findings are consistently bounded to the 53 reviewed primary studies with explicit counts (e.g., '81% of primary studies,' '68% were not evaluated on real-world programs'), without overclaiming beyond the corpus.",
     41         "source": "haiku"
     42       },
     43       "alternative_explanations_discussed": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Python's dominance is attributed to 'wide use in data science' without considering that Python's prevalence in benchmark datasets (HumanEval, MBPP) may be the actual driver; no alternative explanations for observed patterns are considered.",
     47         "source": "haiku"
     48       },
     49       "proxy_outcome_distinction": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 7.2 explicitly distinguishes between evaluation on competitive programming datasets and real-world code, noting the former 'may not represent the complexity of real-world programs, potentially limiting generalizability.'",
     53         "source": "haiku"
     54       }
     55     },
     56     "limitations_and_scope": {
     57       "limitations_section_present": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "There is no dedicated limitations or threats-to-validity section for the survey itself; Section 8 discusses challenges in the reviewed field, not limitations of the authors' own methodology.",
     61         "source": "haiku"
     62       },
     63       "threats_to_validity_specific": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No threats to validity of the SLR are discussed — no mention of potential search term incompleteness, language bias (only English sources implied), or rater disagreement in paper selection.",
     67         "source": "haiku"
     68       },
     69       "scope_boundaries_stated": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "Figure 1 illustrates in-scope topics but does not explicitly state what is excluded and why; the year range of included papers and venue selection are not justified in the paper.",
     73         "source": "haiku"
     74       }
     75     },
     76     "conflicts_of_interest": {
     77       "funding_disclosed": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "No funding acknowledgement or grant information appears anywhere in the paper text.",
     81         "source": "haiku"
     82       },
     83       "affiliations_disclosed": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Author affiliations with TurinTech AI (a commercial AI code optimization company) and universities are fully disclosed in the author information block.",
     87         "source": "haiku"
     88       },
     89       "funder_independent_of_outcome": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "Six of ten authors are affiliated with TurinTech AI, a company that builds AI-based code optimization products — the exact domain being surveyed and validated; the institutional interest in a positive framing of the field is not addressed.",
     93         "source": "haiku"
     94       },
     95       "financial_interests_declared": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No competing interests statement, patent disclosure, or financial interests declaration appears in the paper.",
     99         "source": "haiku"
    100       }
    101     },
    102     "scope_and_framing": {
    103       "key_terms_defined": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 2.1 precisely defines 'code optimization' (transforming programs at source/IR/binary level to achieve performance goals while preserving functionality) and distinguishes it from code generation, refactoring, and repair.",
    107         "source": "haiku"
    108       },
    109       "intended_contribution_clear": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The abstract and Section 1 explicitly state the contribution: 'a systematic literature review of over 50 primary studies' filling 'a significant gap' — no prior comprehensive survey on LM-based code optimization existed.",
    113         "source": "haiku"
    114       },
    115       "engagement_with_prior_work": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper positions itself against existing surveys on LLMs for SE generally [57] and APR specifically [155], explaining why its narrower focus on code optimization is a distinct contribution.",
    119         "source": "haiku"
    120       }
    121     }
    122   },
    123   "type_checklist": {
    124     "survey": {
    125       "search_and_selection": {
    126         "search_strategy_reproducible": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "The paper states search strategy follows 'quasi-gold standard methodology' and searched 'six academic indexing engines,' but the actual search string and database names are relegated to an external GitHub repository with footnote 3 citing space constraints.",
    130           "source": "haiku"
    131         },
    132         "inclusion_exclusion_explicit": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Inclusion/exclusion criteria are mentioned as 'rigorous' in Section 3 but are not stated in the paper; they are outsourced to the GitHub repository, making in-paper assessment impossible.",
    136           "source": "haiku"
    137         },
    138         "prisma_or_structured_protocol": {
    139           "applies": true,
    140           "answer": true,
    141           "justification": "Section 3 explicitly states the survey follows Kitchenham and Charters [69] SLR guidelines for software engineering, a recognized structured review protocol.",
    142           "source": "haiku"
    143         },
    144         "search_terms_provided": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "The actual search string/queries are not provided in the paper; only a brief mention of 'carefully defined search string' appears, with full details deferred to the GitHub repository.",
    148           "source": "haiku"
    149         },
    150         "databases_listed": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "The paper states six 'academic indexing engines' were searched but does not name them anywhere in the paper text; this information is also deferred to the external repository.",
    154           "source": "haiku"
    155         },
    156         "screening_process_documented": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "Figure 4 shows a high-level three-stage process with only the final count (53 primary studies from 2,346), without documenting how many were excluded at each stage or inter-rater reliability for screening decisions.",
    160           "source": "haiku"
    161         },
    162         "review_scope_justified": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "The paper does not justify the temporal scope of the review (no year range is stated), venue selection, or why exactly 53 studies were sufficient for a 'comprehensive' survey.",
    166           "source": "haiku"
    167         }
    168       },
    169       "synthesis_quality": {
    170         "conflicting_findings_acknowledged": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "The synthesis presents aggregate statistics and taxonomies but does not acknowledge cases where reviewed primary studies reached conflicting conclusions about LM effectiveness for code optimization.",
    174           "source": "haiku"
    175         },
    176         "quality_assessment_of_sources": {
    177           "applies": true,
    178           "answer": false,
    179           "justification": "Section 3 mentions 'quality assessments' as part of study selection, but no rubric, criteria, or quality scores for the 53 primary studies are presented in the paper; studies are treated as homogeneous once included.",
    180           "source": "haiku"
    181         },
    182         "publication_bias_discussed": {
    183           "applies": true,
    184           "answer": false,
    185           "justification": "No mention of publication bias appears anywhere in the paper; the possibility that negative results about LM code optimization are underrepresented in the corpus is not acknowledged.",
    186           "source": "haiku"
    187         },
    188         "quantitative_synthesis_present": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "The paper provides systematic vote-counting with percentages throughout (e.g., 57% off-the-shelf, 81% single-language, 68% no real-world evaluation) and structured tables with instance counts across all taxonomic categories.",
    192           "source": "haiku"
    193         },
    194         "recommendations_supported_by_evidence": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "Each of the 8 future directions is directly tied to a quantified gap identified in the 53 reviewed papers (e.g., 'multi-objective optimization' recommended because 79% of papers target single metrics); recommendations don't significantly exceed the evidence base.",
    198           "source": "haiku"
    199         }
    200       }
    201     }
    202   },
    203   "claims": [
    204     {
    205       "claim": "General-purpose LMs (61 instances) were more widely adopted than code-specialized LMs (43 instances) for code optimization.",
    206       "evidence": "Direct count from Table 1 across 53 primary studies; GPT-4 alone used in 15 studies.",
    207       "supported": "strong"
    208     },
    209     {
    210       "claim": "57% of studies used off-the-shelf pre-trained models while 43% employed fine-tuning.",
    211       "evidence": "Figure 7 showing distribution of training approaches across 53 primary studies.",
    212       "supported": "strong"
    213     },
    214     {
    215       "claim": "81% of primary studies focused on optimizing a single programming language.",
    216       "evidence": "Figure 8 showing language distribution; Python dominated with 30 of 53 studies.",
    217       "supported": "strong"
    218     },
    219     {
    220       "claim": "68% of studies did not evaluate code optimization on real-world programs, and only 9% used full real-world projects.",
    221       "evidence": "Figure 10 with breakdown: 36 studies (68%) no real-world, 12 (23%) snippets, 5 (9%) full projects.",
    222       "supported": "strong"
    223     },
    224     {
    225       "claim": "Feedback-based iterative optimization was the dominant model-based technique (35 of 51 model-based instances).",
    226       "evidence": "Table 3 showing distribution of code optimization techniques across primary studies.",
    227       "supported": "strong"
    228     },
    229     {
    230       "claim": "79% of studies optimized for a single performance metric, predominantly runtime.",
    231       "evidence": "Figure 9 and Table 6; runtime used in 24 studies, 42/53 studies target one metric.",
    232       "supported": "strong"
    233     },
    234     {
    235       "claim": "LMs currently struggle with larger real-world programs and often yield marginal improvements over traditional compilers.",
    236       "evidence": "Citing Romero Rosas et al. [113]; this is a secondary claim from a reviewed study, not the survey authors' own finding.",
    237       "supported": "moderate"
    238     }
    239   ],
    240   "methodology_tags": [
    241     "qualitative",
    242     "meta-analysis"
    243   ],
    244   "key_findings": "A systematic review of 53 primary studies finds that general-purpose LMs (especially GPT-4) dominate LM-based code optimization over specialized models, and the majority (57%) rely on off-the-shelf pretrained models. Critical evaluation gaps are identified: 81% of studies target a single programming language, 68% do not test on real-world code, and only 9% use full real-world projects. Five open challenges are identified — balancing model complexity with practicality, limited external system integration, poor cross-language/metric generalizability, insufficient real-world evaluation, and trust/reliability concerns — with eight corresponding future research directions including model compression, agentic LMs, and RLHF.",
    245   "red_flags": [
    246     {
    247       "flag": "Undeclared commercial conflict of interest",
    248       "detail": "Six of ten authors are affiliated with TurinTech AI, a commercial company building AI code optimization products — the exact field being positively surveyed. No competing interests statement is provided."
    249     },
    250     {
    251       "flag": "Key methodology deferred to external repo",
    252       "detail": "Search terms, database names, and inclusion/exclusion criteria are all relegated to a GitHub repository rather than stated in the paper, making the review non-reproducible from the paper alone."
    253     },
    254     {
    255       "flag": "No limitations section",
    256       "detail": "The survey has no section discussing its own methodological limitations, threats to validity, or potential for search incompleteness — despite following an SLR protocol that typically requires this."
    257     },
    258     {
    259       "flag": "Publication bias unaddressed",
    260       "detail": "The positive framing of LM-based code optimization is never qualified by acknowledging that published studies overwhelmingly report positive results; negative findings are likely underrepresented in the corpus."
    261     },
    262     {
    263       "flag": "No source quality assessment",
    264       "detail": "The 53 included papers are treated as homogeneous after inclusion; no quality rubric or risk-of-bias scoring is applied to distinguish strong from weak primary studies."
    265     },
    266     {
    267       "flag": "Screening counts absent",
    268       "detail": "Figure 4 shows a total of 2,346 initial papers reduced to 53, but provides no counts at intermediate screening stages, preventing assessment of attrition."
    269     }
    270   ],
    271   "cited_papers": [
    272     {
    273       "title": "Guidelines for Performing Systematic Literature Reviews in Software Engineering",
    274       "relevance": "Foundational SLR methodology the survey explicitly follows (Kitchenham and Charters [69])"
    275     },
    276     {
    277       "title": "Large Language Models for Software Engineering: A Systematic Literature Review",
    278       "relevance": "Prior SLR on LLMs for SE generally; this survey positions itself as more narrowly focused on code optimization"
    279     },
    280     {
    281       "title": "Large Language Model-Based Agents for Software Engineering: A Survey",
    282       "relevance": "Related survey on LLM agents for SE; cited as motivation for agentic future directions"
    283     },
    284     {
    285       "title": "Learning Performance-Improving Code Edits",
    286       "relevance": "Key primary study introducing the PIE dataset (77K slow-fast code pairs), most cited benchmark in the survey"
    287     },
    288     {
    289       "title": "Mathematical Discoveries from Program Search with Large Language Models",
    290       "relevance": "FunSearch — high-profile primary study using LLMs for algorithmic code optimization (DeepMind/Nature 2024)"
    291     },
    292     {
    293       "title": "Meta Large Language Model Compiler: Foundation Models of Compiler Optimization",
    294       "relevance": "Key primary study on LLM-based compiler optimization at the IR/assembly level"
    295     },
    296     {
    297       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    298       "relevance": "Representative feedback-based iterative optimization approach cited multiple times as paradigmatic technique"
    299     },
    300     {
    301       "title": "ECCO: Can We Improve Model-Generated Code Efficiency Without Sacrificing Functional Correctness?",
    302       "relevance": "Key benchmark dataset and evaluation framework for code efficiency optimization"
    303     },
    304     {
    305       "title": "Should AI Optimize Your Code? A Comparative Study of Current Large Language Models Versus Classical Optimizing Compilers",
    306       "relevance": "Critical empirical study finding LLMs struggle with larger programs vs traditional compilers — the main contrarian finding cited"
    307     }
    308   ],
    309   "engagement_factors": {
    310     "practical_relevance": {
    311       "score": 2,
    312       "justification": "Provides a taxonomy and model selection guide (Table 1) directly useful to practitioners choosing LMs for code optimization tasks."
    313     },
    314     "surprise_contrarian": {
    315       "score": 1,
    316       "justification": "The finding that 68% of studies don't test on real-world code and only 9% use full projects is mildly surprising given the applied framing of most papers."
    317     },
    318     "fear_safety": {
    319       "score": 0,
    320       "justification": "No AI safety, misuse, or risk concerns discussed; the paper is purely technical."
    321     },
    322     "drama_conflict": {
    323       "score": 0,
    324       "justification": "No controversy or conflicting claims between research groups; the survey is descriptive and non-polemical."
    325     },
    326     "demo_ability": {
    327       "score": 1,
    328       "justification": "Links to GitHub repository with full methodology and raw results, but no interactive demo or runnable artifact."
    329     },
    330     "brand_recognition": {
    331       "score": 1,
    332       "justification": "Survey covers GPT-4, Code LLaMA, DeepSeek, and other well-known models; authors are from TurinTech AI and University of Leeds, which are moderately recognized in the SE/compilers space."
    333     }
    334   },
    335   "hn_data": {
    336     "threads": [],
    337     "top_points": 0,
    338     "total_points": 0,
    339     "total_comments": 0
    340   }
    341 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs