scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19433B)
      1 {
      2   "paper": {
      3     "title": "Controlled Self-Evolution for Algorithmic Code Optimization",
      4     "authors": ["Tu Hu", "Ronghao Chen", "Shuo Zhang", "Jianghao Yin", "Mou Xiao Feng", "Jingping Liu", "Shaolei Zhang", "Andy Wang", "Wenqi Jiang", "Yuqi Fang", "Sen Hu", "Huacan Wang", "Yi Xu"],
      5     "year": 2026,
      6     "arxiv_id": "2601.07348"
      7   },
      8   "checklist": {
      9     "artifacts": {
     10       "code_released": {
     11         "applies": true,
     12         "answer": true,
     13         "justification": "The abstract states 'Our code is publicly available at https://github.com/QuantaAlpha/EvoControl.'"
     14       },
     15       "data_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "They evaluate on EffiBench-X, a publicly available benchmark (Qing et al., 2025). No proprietary data was collected."
     19       },
     20       "environment_specified": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No requirements.txt, Dockerfile, or detailed environment setup with library versions is described in the paper."
     24       },
     25       "reproduction_instructions": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No step-by-step reproduction instructions are provided in the paper. Implementation details are given (Appendix C) but no README-level commands or scripts to replicate experiments."
     29       }
     30     },
     31     "statistical_methodology": {
     32       "confidence_intervals_or_error_bars": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "All results in Table 1 are point estimates with no confidence intervals or error bars."
     36       },
     37       "significance_tests": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper claims CSE outperforms all baselines but provides no statistical significance tests. Comparisons rest solely on differences between point estimates."
     41       },
     42       "effect_sizes_reported": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Only raw percentage scores are reported. No effect sizes (Cohen's d, etc.) are provided. While baseline context exists (e.g., Direct vs. CSE numbers), no formal effect size measures are used."
     46       },
     47       "sample_size_justified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The benchmark has 623 problems but no justification is given for why this sample size is adequate for the claims made."
     51       },
     52       "variance_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Appendix C mentions 5 repeated runs per candidate with outlier removal for the reward signal, but no variance or standard deviation is reported for the final metrics across runs or problems in Table 1."
     56       }
     57     },
     58     "evaluation_design": {
     59       "baselines_included": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Four baselines are compared: Direct, Self-Reflection, SE-Agent, and AlphaEvolve (Table 1)."
     63       },
     64       "baselines_contemporary": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Baselines include AlphaEvolve (Novikov et al., 2025) and SE-Agent (Lin et al., 2025), which are recent and competitive methods."
     68       },
     69       "ablation_study": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Table 2 presents ablation removing each of the three components (Planning, Evolution, Memory). Figure 4 further analyzes Memory's synergy with other components."
     73       },
     74       "multiple_metrics": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Three metrics are reported: Execution Time (ET), Memory Peak (MP), and Memory Integral (MI)."
     78       },
     79       "human_evaluation": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No human evaluation of code quality is included. All evaluation is automated via test cases and efficiency metrics."
     83       },
     84       "held_out_test_set": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "EffiBench-X provides comprehensive test cases per problem (100 test cases with time/memory limits per Appendix C). The benchmark itself serves as the held-out evaluation."
     88       },
     89       "per_category_breakdown": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Results are broken down by programming language (Python vs. C++) and by metric (ET, MP, MI) in Table 1."
     93       },
     94       "failure_cases_discussed": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section 5.3 discusses evolution dynamics including failure lessons in local memory (Section 4.3). The case study in Section 5.4 shows the full evolution trajectory including failures."
     98       },
     99       "negative_results_reported": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Figure 4 shows that adding Memory to Planning alone provides no benefit (delta MI = -0.23, a slight decrease), an honest negative result about component interactions."
    103       }
    104     },
    105     "claims_and_evidence": {
    106       "abstract_claims_supported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The abstract claims CSE 'consistently outperforms all baselines across various LLM backbones' and 'achieves higher efficiency from early generations.' Table 1 and Figure 3 support these claims."
    110       },
    111       "causal_claims_justified": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Causal claims like 'CSE's effectiveness arises from the synergistic interplay of all three components' are supported by controlled ablation studies (Table 2) where single components are removed."
    115       },
    116       "generalization_bounded": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The title 'Algorithmic Code Optimization' is reasonably scoped, but results are only on EffiBench-X (competitive programming). The paper does not bound generalization to this domain — it implies the framework works for code optimization generally."
    120       },
    121       "alternative_explanations_discussed": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "No alternative explanations for the results are discussed. The improvements could partly be due to the increased effective prompt length from memory injection, but this is not considered."
    125       }
    126     },
    127     "setup_transparency": {
    128       "model_versions_specified": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper uses 'DeepSeek-V3-0324', 'Qwen3-235B-A22B', 'Claude-4.5-Sonnet', and 'GPT-5'. DeepSeek-V3-0324 includes a date suffix. However, Claude-4.5-Sonnet and GPT-5 lack snapshot dates or API versions."
    132       },
    133       "prompts_provided": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper describes prompt strategies conceptually but does not provide the actual prompt text used for planning, mutation, crossover, or reflection. Appendix C mentions a 'decomposition template' (Figure 9) but the full prompts are not shown in the paper text provided."
    137       },
    138       "hyperparameters_reported": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Appendix C reports: T=30 iterations, Ninit=5 plans, Nq=3 queries, Km=3 retrievals, epsilon=0.001, k=5 clipping, alternating mutation/crossover schedule, and embedding model (Qwen3-8B-Embedding)."
    142       },
    143       "scaffolding_described": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The agentic scaffolding is described in detail: planning initialization (Section 4.1), genetic evolution with mutation and crossover (Section 4.2), hierarchical memory with local/global components and retrieval (Section 4.3), and Algorithm 1 provides the full workflow."
    147       },
    148       "data_preprocessing_documented": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Appendix C documents the evaluation protocol: efficiency measured only on tasks solved by Direct, fallback to Direct baseline, 100 test cases per problem, 5 repeated runs with outlier removal."
    152       }
    153     },
    154     "limitations_and_scope": {
    155       "limitations_section_present": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "A dedicated 'Limitations' section appears after Section 6 (Conclusion), discussing the lack of amortization into the base model."
    159       },
    160       "threats_to_validity_specific": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "The limitations section only discusses one future direction (distilling trajectories into RL training). It does not discuss specific threats to validity of the current results."
    164       },
    165       "scope_boundaries_stated": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper does not explicitly state what the results do NOT show. No discussion of what settings, tasks, or domains are excluded from the claims."
    169       }
    170     },
    171     "data_integrity": {
    172       "raw_data_available": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No raw experimental data (per-problem scores, evolution trajectories) is released. Only aggregate results in tables."
    176       },
    177       "data_collection_described": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "EffiBench-X aggregates 623 problems from AtCoder, Codeforces, and LeetCode (Section 5.1). The benchmark source and evaluation protocol are described."
    181       },
    182       "recruitment_methods_described": {
    183         "applies": false,
    184         "answer": false,
    185         "justification": "No human participants. The study uses automated benchmarks only."
    186       },
    187       "data_pipeline_documented": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Appendix C documents the full pipeline: problem selection from EffiBench-X, Direct baseline solving, efficiency measurement only on solved tasks, fallback protocol, 5-run averaging with outlier removal."
    191       }
    192     },
    193     "conflicts_of_interest": {
    194       "funding_disclosed": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No funding or acknowledgments section is present in the paper."
    198       },
    199       "affiliations_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Author affiliations are listed: NJU, PKU, Midea-AIRC, ECNU, SYSU, RUC, QuantaAlpha, QuantML."
    203       },
    204       "funder_independent_of_outcome": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding is disclosed, so independence cannot be assessed. Authors from Midea-AIRC and QuantaAlpha (industry affiliations) could have commercial interests."
    208       },
    209       "financial_interests_declared": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No competing interests statement is present. Authors are affiliated with QuantaAlpha and QuantML which may have commercial interests in code optimization."
    213       }
    214     },
    215     "contamination": {
    216       "training_cutoff_stated": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "The paper evaluates GPT-5, Claude-4.5-Sonnet, DeepSeek-V3, and Qwen3 on EffiBench-X problems sourced from competitive programming platforms, but states no training cutoff dates for any model."
    220       },
    221       "train_test_overlap_discussed": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "EffiBench-X problems come from AtCoder, Codeforces, and LeetCode — publicly available sources likely in model training data. No discussion of potential overlap."
    225       },
    226       "benchmark_contamination_addressed": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "Competitive programming problems from public platforms are likely in training data of all evaluated models. This contamination risk is not discussed."
    230       }
    231     },
    232     "human_studies": {
    233       "pre_registered": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No human participants in this study."
    237       },
    238       "irb_or_ethics_approval": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants in this study."
    242       },
    243       "demographics_reported": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "inclusion_exclusion_criteria": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "randomization_described": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "blinding_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "attrition_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       }
    268     },
    269     "cost_and_practicality": {
    270       "inference_cost_reported": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "The method runs 30 LLM calls per task across 623 problems with 4 models, but no API costs, token counts, or wall-clock time is reported."
    274       },
    275       "compute_budget_stated": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "No total computational budget, GPU hours, or API spend is stated despite significant compute requirements (4 models x 623 problems x 30 iterations)."
    279       }
    280     }
    281   },
    282   "claims": [
    283     {
    284       "claim": "CSE consistently outperforms all baselines across various LLM backbones on EffiBench-X.",
    285       "evidence": "Table 1 shows CSE achieves highest average MI across all four models (Qwen3: 50.17%, DeepSeek-V3: 55.09%, Claude-4.5-Sonnet: 74.41%, GPT-5: 67.47%).",
    286       "supported": "moderate"
    287     },
    288     {
    289       "claim": "CSE achieves higher efficiency from early generations and maintains continuous improvement throughout evolution.",
    290       "evidence": "Figure 3 shows best-so-far MI curves. Table 3 shows CSE has more improvement events (#Imp.=1.79) and later Iter@Best (12.06) than baselines.",
    291       "supported": "moderate"
    292     },
    293     {
    294       "claim": "All three components (Planning, Evolution, Memory) are essential, with Memory showing the largest impact.",
    295       "evidence": "Table 2 ablation: removing Memory drops MI from 68.10% to 63.08% (largest drop). Figure 4 shows Memory's gain is context-dependent.",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "Memory amplifies controlled evolutionary processes rather than providing universal improvement.",
    300       "evidence": "Figure 4 shows Memory + Planning alone gives delta MI = -0.23, but Memory + Evolution gives +2.68, and all three together give +5.02.",
    301       "supported": "strong"
    302     }
    303   ],
    304   "methodology_tags": ["benchmark-eval"],
    305   "key_findings": "CSE proposes three mechanisms (diversified planning initialization, genetic evolution with controlled mutation/crossover, and hierarchical evolution memory) to improve exploration efficiency in LLM-based code optimization. On EffiBench-X across four LLM backbones and two programming languages, CSE outperforms baselines including AlphaEvolve and SE-Agent on efficiency metrics. Ablation studies show all three components contribute, with memory providing the largest impact but only when combined with evolution, demonstrating a synergistic rather than additive effect.",
    306   "red_flags": [
    307     {
    308       "flag": "No statistical significance testing",
    309       "detail": "All comparative claims are based on comparing point estimates without any significance tests, confidence intervals, or variance reporting. Margins between CSE and baselines are often small (e.g., 0.36% ET difference for Qwen3 Python)."
    310     },
    311     {
    312       "flag": "Benchmark contamination risk unaddressed",
    313       "detail": "EffiBench-X problems come from AtCoder, Codeforces, and LeetCode — public platforms almost certainly in the training data of GPT-5, Claude-4.5-Sonnet, etc. This is not discussed."
    314     },
    315     {
    316       "flag": "No cost reporting despite heavy compute",
    317       "detail": "Running 4 LLMs x 623 problems x 30 iterations represents substantial compute. No costs, token counts, or wall-clock times are reported, making practical applicability impossible to assess."
    318     },
    319     {
    320       "flag": "Industry affiliations without conflict disclosure",
    321       "detail": "Authors are affiliated with Midea-AIRC, QuantaAlpha, and QuantML (industry entities) but no conflicts of interest or funding sources are disclosed."
    322     }
    323   ],
    324   "cited_papers": [
    325     {
    326       "title": "AlphaEvolve: A coding agent for scientific and algorithmic discovery",
    327       "authors": ["Alexander Novikov"],
    328       "year": 2025,
    329       "arxiv_id": "2506.13131",
    330       "relevance": "Key baseline; represents population-based evolutionary code optimization with LLMs."
    331     },
    332     {
    333       "title": "SE-Agent: Self-Evolution Trajectory Optimization in Multi-Step Reasoning with LLM-Based Agents",
    334       "authors": ["Jiaye Lin"],
    335       "year": 2025,
    336       "arxiv_id": "2508.02085",
    337       "relevance": "Key baseline; trajectory-level self-evolving agent for code generation."
    338     },
    339     {
    340       "title": "AfterBurner: Reinforcement Learning Facilitates Self-Improving Code Efficiency Optimization",
    341       "authors": ["Mingzhe Du"],
    342       "year": 2025,
    343       "arxiv_id": "2505.23387",
    344       "relevance": "Self-reflection baseline for code efficiency optimization."
    345     },
    346     {
    347       "title": "EffiBench-X: A Multi-Language Benchmark for Measuring Efficiency of LLM-Generated Code",
    348       "authors": ["Yuhao Qing"],
    349       "year": 2025,
    350       "arxiv_id": "2505.13004",
    351       "relevance": "The primary evaluation benchmark used; measures code efficiency across languages."
    352     },
    353     {
    354       "title": "EffiLearner: Enhancing Efficiency of Generated Code via Self-Optimization",
    355       "authors": ["Dong Huang"],
    356       "year": 2024,
    357       "relevance": "Prior self-optimization method for code efficiency; evaluation protocol adopted from this work."
    358     },
    359     {
    360       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    361       "authors": ["Aman Madaan"],
    362       "year": 2023,
    363       "relevance": "Foundational self-refinement method for LLM outputs."
    364     },
    365     {
    366       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    367       "authors": ["Noah Shinn"],
    368       "year": 2023,
    369       "relevance": "Self-reflection framework for LLM agents using verbal feedback."
    370     },
    371     {
    372       "title": "Mathematical discoveries from program search with large language models",
    373       "authors": ["Bernardino Romera-Paredes"],
    374       "year": 2023,
    375       "relevance": "FunSearch; foundational work on LLM-driven evolutionary program search."
    376     },
    377     {
    378       "title": "Evaluating large language models trained on code",
    379       "authors": ["Mark Chen"],
    380       "year": 2021,
    381       "arxiv_id": "2107.03374",
    382       "relevance": "HumanEval benchmark paper; foundational LLM code generation evaluation."
    383     },
    384     {
    385       "title": "A survey on code generation with LLM-based agents",
    386       "authors": ["Yihong Dong"],
    387       "year": 2025,
    388       "arxiv_id": "2508.00083",
    389       "relevance": "Survey of LLM-based code generation agents relevant to the survey scope."
    390     }
    391   ]
    392 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs