scan.json (19196B)
1 { 2 "paper": { 3 "title": "Compiler.next: A Search-Based Compiler to Power the AI-Native Future of Software Engineering", 4 "authors": ["Filipe R. Cogo", "Gustavo A. Oliva", "Ahmed E. Hassan"], 5 "year": 2025, 6 "venue": "Manuscript submitted to ACM", 7 "arxiv_id": "2510.24799" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "No repository URL or code archive is provided. The paper describes a prototype but does not release it." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "The evaluation uses HumanEval-Plus (public benchmark), but no compilation traces, gold labels, or experimental data are released." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No environment specifications, dependency lists, or setup instructions are provided." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No reproduction instructions are provided for the proof-of-concept experiments." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Tables 1 and 2 report single point estimates with no confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "Claims of improvement (e.g., 46.4% accuracy improvement) are made by comparing two numbers with no statistical tests." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Tables 1 and 2 report percentage improvements with baseline context (e.g., accuracy from 0.26 to 0.56, latency from 14.2 to 10.8s)." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "Five generations of ten candidates per task with 70/30 train/test split of HumanEval-Plus is described but not justified. 
No discussion of whether this is sufficient." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "Results appear to be from single runs. No variance, standard deviation, or multiple-run results are reported." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The 'Initial' synthesized prompt serves as a baseline compared against the 'Optimized' prompt in Table 1. Table 2 compares with and without caching." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": false, 68 "justification": "No comparison against other prompt optimization systems (DSPy, TextGrad, EvoPrompt, etc.) despite discussing them extensively in related work." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": false, 73 "justification": "No ablation study isolating contributions of individual Compiler.next components (e.g., self-reflection, crossover, mutation, caching)." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Three metrics are reported: accuracy, average latency, and average number of tokens." 79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "This is a compiler/optimization system evaluated via automated code execution metrics; human evaluation is not relevant." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "70% of HumanEval-Plus tasks used as gold labels for optimization, 30% held out for evaluation." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": false, 93 "justification": "Only aggregate results are reported in Tables 1 and 2. No per-task or per-category breakdown is provided." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": false, 98 "justification": "No failure cases or error analysis of the compilation process or generated code is discussed." 
99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Table 2 shows caching reduces accuracy from 1.00 to 0.70 (-30%) and increases latency, honestly reporting the trade-off cost." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": false, 110 "justification": "The abstract claims Compiler.next enables 'seamless evolution of AI-native software systems' and 'democratizing software development' — these are unsupported vision statements far beyond what the small proof-of-concept demonstrates." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": false, 115 "justification": "The paper claims Compiler.next 'improves accuracy while also reducing latency and execution cost' (causal language) based on a single-run comparison of initial vs. optimized prompts with no controls for confounds." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The evaluation uses only HumanEval-Plus (Python code generation) with two models, but the paper positions Compiler.next as a general-purpose compiler for all FMware. The title claims 'AI-Native Future of Software Engineering' broadly." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No alternative explanations for the observed improvements are discussed. For example, simple prompt engineering without the full compiler infrastructure might achieve similar results." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "Models listed as 'Qwen2.5-7B-Instruct' and 'GPT-4o-mini' without snapshot dates or API versions." 
133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "The intent string is given ('Generate source code from documentation') but the actual system prompts, mutation prompts, crossover prompts, and self-reflection prompts used by the compiler are not provided." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Some hyperparameters are reported: 5 generations, 10 candidates per task, 70/30 split, Euclidean similarity threshold of 0.85 for caching." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The search mechanism, NSGA-II optimizer, caching architecture, and overall compilation pipeline are described in detail in Section 4 with figures." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": false, 152 "justification": "No description of how HumanEval-Plus tasks were split into the 70/30 partition or any preprocessing applied." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "No dedicated limitations or threats-to-validity section. The paper ends with a conclusion and disclaimer." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No threats to validity are discussed." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 1 states: 'intent compilation represents a foundational capability... but it is not the entirety of software engineering in the SE 3.0 era' and lists complementary advances needed. Section 3.6 positions the work relative to MBSE and self-adaptive systems." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "No raw data, compilation traces, or experimental logs are made available." 
177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "The evaluation data source (HumanEval-Plus benchmark) is clearly identified with the 70/30 split and evaluation procedure described." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants; data is from a standard benchmark." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": false, 191 "justification": "The high-level optimization loop is described but specific details of data flow (e.g., how many tasks total, which tasks in train vs. test) are not documented." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding or acknowledgments section. Two of three authors are from Huawei Canada but no funding disclosure." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly listed: two from Centre for Software Excellence - Huawei Canada, one from Queen's University." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "Two authors are Huawei employees. Huawei has commercial interest in AI-native development tools, making the funder non-independent of the outcome. No disclosure of this conflict." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement. Authors from Huawei may have financial interests in commercializing this technology." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "No training data cutoff dates stated for either Qwen2.5-7B-Instruct or GPT-4o-mini." 
221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "HumanEval-Plus is a well-known public benchmark likely in both models' training data. No discussion of potential contamination." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "HumanEval was published in 2021 and is widely known. Both models were trained after this date. No contamination analysis is performed." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "Token counts and latency per run are reported in Tables 1 and 2. Total compilation runtime is reported in Table 2 (8m:15s vs 10m:27s)." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "Total API cost, GPU usage for Qwen2.5-7B-Instruct, and overall computational budget are not stated." 
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Compiler.next improves accuracy from 0.26 to 0.56 for Qwen2.5-7B-Instruct and from 0.68 to 1.00 for GPT-4o-mini on HumanEval-Plus", 286 "evidence": "Table 1 in Section 4.3 shows these results comparing initial synthesized prompts vs. best optimized prompts after 5 generations.", 287 "supported": "weak" 288 }, 289 { 290 "claim": "Compiler.next reduces latency and token usage while improving accuracy", 291 "evidence": "Table 1 shows latency reduced from 14.2s to 10.8s (Qwen) and 8.7s to 5.0s (GPT-4o-mini), and tokens reduced from 537 to 369 (Qwen) and 500 to 417 (GPT-4o-mini).", 292 "supported": "weak" 293 }, 294 { 295 "claim": "Semantic caching provides 22.1% speed-up in compilation time", 296 "evidence": "Table 2 shows total running time of 10m:27s without cache vs 8m:15s with cache. However, caching reduced accuracy from 1.00 to 0.70.", 297 "supported": "weak" 298 }, 299 { 300 "claim": "Compiler.next enables the seamless evolution of AI-native software systems as part of the SE 3.0 era", 301 "evidence": "This is a vision claim supported by architectural descriptions and the proof-of-concept, but no evidence of 'seamless evolution' or real-world deployment.", 302 "supported": "unsupported" 303 } 304 ], 305 "methodology_tags": ["theoretical", "benchmark-eval", "case-study"], 306 "key_findings": "Compiler.next is proposed as a search-based compiler that uses NSGA-II to optimize FMware configurations (prompts, model parameters, cognitive architectures) from human intents. A proof-of-concept on HumanEval-Plus shows the system can improve code generation accuracy for two models while reducing latency and token usage, though at the cost of exploration diversity when caching is enabled. 
The paper is primarily a vision paper with 10 calls for action for the SE community, with only a small-scale validation.", 307 "red_flags": [ 308 { 309 "flag": "Company evaluating its own vision", 310 "detail": "Two of three authors are Huawei employees proposing a vision that aligns with Huawei's commercial interests in AI development platforms. No conflict of interest disclosure." 311 }, 312 { 313 "flag": "Vision claims far exceed evidence", 314 "detail": "The paper claims to enable 'democratizing software development' and 'seamless evolution of AI-native software systems' based on a small proof-of-concept with 5 generations of 10 candidates on a single benchmark." 315 }, 316 { 317 "flag": "No comparison with existing tools", 318 "detail": "Despite extensively discussing DSPy, TextGrad, EvoPrompt, ProTeGi, and SAMMO in related work, no experimental comparison is made against any of them." 319 }, 320 { 321 "flag": "Single-run results with no uncertainty quantification", 322 "detail": "Tables 1 and 2 appear to report single-run results with no error bars, confidence intervals, or multiple-run statistics, making it impossible to assess result stability." 323 }, 324 { 325 "flag": "Benchmark contamination risk", 326 "detail": "HumanEval-Plus, an extension of HumanEval (published 2021), is used without any discussion of contamination for models trained well after the original benchmark's publication." 327 }, 328 { 329 "flag": "GPT-4o-mini reaches 100% accuracy", 330 "detail": "GPT-4o-mini achieves 1.00 accuracy on the 30% held-out HumanEval-Plus tasks, which is suspiciously high and could indicate contamination or a very easy subset." 331 } 332 ], 333 "cited_papers": [ 334 { 335 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 336 "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig", "S. Yao", "K. Pei", "O. Press", "K. Narasimhan"], 337 "year": 2024, 338 "arxiv_id": "2310.06770", 339 "relevance": "Major benchmark for evaluating LLM coding agents on real-world software engineering tasks."
340 }, 341 { 342 "title": "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines", 343 "authors": ["O. Khattab", "A. Singhvi", "P. Maheshwari"], 344 "year": 2023, 345 "arxiv_id": "2310.03714", 346 "relevance": "Key prior work on programmatic optimization of LLM pipelines, directly comparable to Compiler.next." 347 }, 348 { 349 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 350 "authors": ["Q. Wu", "G. Bansal", "J. Zhang"], 351 "year": 2023, 352 "arxiv_id": "2308.08155", 353 "relevance": "Multi-agent framework for LLM applications, relevant to agentic AI architectures." 354 }, 355 { 356 "title": "Automated Design of Agentic Systems", 357 "authors": ["S. Hu", "C. Lu", "J. Clune"], 358 "year": 2024, 359 "arxiv_id": "2408.08435", 360 "relevance": "Automated search over agent architectures, directly relevant to Compiler.next's search over cognitive architectures." 361 }, 362 { 363 "title": "Towards AI-Native Software Engineering (SE 3.0): A Vision and a Challenge Roadmap", 364 "authors": ["A. E. Hassan", "G. A. Oliva", "D. Lin"], 365 "year": 2024, 366 "arxiv_id": "2410.06107", 367 "relevance": "Foundational vision paper for SE 3.0 paradigm that Compiler.next builds upon." 368 }, 369 { 370 "title": "Large Language Models Are Human-Level Prompt Engineers", 371 "authors": ["Y. Zhou", "A. I. Muresanu", "Z. Han"], 372 "year": 2023, 373 "relevance": "Automatic Prompt Engineer (APE) — foundational work on automated prompt optimization using LLMs." 374 }, 375 { 376 "title": "Promptbreeder: Self-Referential Self-Improvement via Prompt Evolution", 377 "authors": ["C. Fernando", "D. S. Banarse", "H. Michalewski"], 378 "year": 2024, 379 "relevance": "Evolutionary prompt optimization with self-referential mutation, directly relevant to LLM prompt engineering methodology." 380 }, 381 { 382 "title": "Connecting Large Language Models with Evolutionary Algorithms Yields Powerful Prompt Optimizers", 383 "authors": ["Q. Guo", "R.
Wang", "J. Guo"], 384 "year": 2024, 385 "relevance": "EvoPrompt — evolutionary approach to prompt optimization, comparable methodology to Compiler.next." 386 }, 387 { 388 "title": "Optimizing Generative AI by Backpropagating Language Model Feedback", 389 "authors": ["M. Yuksekgonul", "F. Bianchi", "J. Boen"], 390 "year": 2025, 391 "doi": "10.1038/s41586-025-08661-4", 392 "relevance": "TextGrad — gradient-descent metaphor for optimizing LLM pipelines, key competing approach." 393 }, 394 { 395 "title": "Rethinking Software Engineering in the Era of Foundation Models: A Curated Catalogue of Challenges in the Development of Trustworthy FMware", 396 "authors": ["A. E. Hassan", "D. Lin", "G. K. Rajbahadur"], 397 "year": 2024, 398 "doi": "10.1145/3663529.3663849", 399 "relevance": "Foundational work on challenges in FMware engineering that Compiler.next addresses." 400 } 401 ] 402 }