scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24670B)
      1 {
      2   "paper": {
      3     "title": "APRIL: API Synthesis with Automatic Prompt Optimization and Reinforcement Learning",
      4     "authors": [
      5       "Hua Zhong",
      6       "Shan Jiang",
      7       "Sarfraz Khurshid"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv preprint",
     11     "arxiv_id": "2509.25196"
     12   },
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper. There is no mention of code availability."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The benchmark consists of 81 tasks from NumPy, scikit-learn, and SciPy, but no dataset download link or benchmark artifact is provided. The specific tasks are not enumerated or released."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions using Gemini 2.0 and Python libraries but does not specify library versions or dependency details sufficient to recreate the environment."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The paper describes the methodology at a high level but does not provide actionable steps to reproduce the experiments."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Results in Tables 1-3 report only point estimates (e.g., 93.8% pass rate). No confidence intervals, error bars, or uncertainty measures are provided."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper claims APRIL outperforms the baseline (93.8% vs. 77.8%) but provides no statistical significance test (e.g., McNemar's test, Fisher's exact test) to support the comparison."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper reports percentage improvements with baseline context: '16.6%, 14.9%, and 16.7% over the baseline on the three benchmarks' (Section 4.2), and both absolute rates are given (e.g., 77.8% baseline vs. 93.8% APRIL), providing enough context to judge magnitude."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The benchmark consists of 81 tasks (36 NumPy, 33 scikit-learn, 12 SciPy) with no justification for why these numbers were chosen or whether they are sufficient for the claims made. The SciPy subset is particularly small (12 tasks)."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. It is unclear whether experiments were run multiple times or only once. Results appear to be single-run numbers."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper compares APRIL against a baseline using the initial manually engineered prompt on the same Gemini 2.0 model (Table 2, Section 4.2)."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "The only baseline is the same Gemini 2.0 model with a manually engineered prompt. No comparison is made against other code generation tools, other LLMs (e.g., GPT-4, Claude, open-source models), or prior program synthesis systems like SyPet, FrAngel, or EdSynth that are discussed in related work."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "The paper explicitly states: 'Due to time and space constraints, we do not conduct an ablation study isolating the individual contributions of APO and RLVR' (Section 4.2, after RQ2). This is a significant omission for a system with two main components."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper reports both executability rate (100%) and test pass rate (93.8%) as separate metrics in Table 1."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No human evaluation of the synthesized APIs is included. Evaluation is entirely automated via test suites. For API synthesis, human evaluation of code quality, readability, or maintainability would be relevant given the paper's claims about producing 'maintainable, idiomatic' implementations."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "The paper uses 40 of the 81 tasks for APO/RLVR training but does not clearly separate which tasks are training vs. evaluation. It is unclear whether the reported 93.8% pass rate is on held-out tasks only or includes training tasks. The paper does not explicitly state that the evaluation set is disjoint from the training set."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down per library (NumPy, scikit-learn, SciPy) in Tables 1-3, showing per-benchmark performance rather than just aggregate numbers."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "The paper reports that 5 APIs failed validation tests (76 out of 81 passed) but does not analyze why these failed, what patterns characterize failures, or show examples of incorrect synthesized code."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "No negative results are reported. Every configuration shown improves over the baseline. The paper mentions Figure 4 showing 'potential iterative improvement from APO on RLVR fine-tuned Model' but does not report any failed approaches or configurations that did not work."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims 'APRIL achieves substantial improvements' and 'a success rate exceeding 93.8%', both of which are supported by Tables 1 and 2. The abstract also claims this is 'a robust, scalable path' which is somewhat strong given the limited evaluation, but the core empirical claims are supported."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper claims that 'integrating APO and RLVR provides a robust, scalable path for component-based API synthesis' but without an ablation study, it cannot attribute improvements to either component. The causal mechanism behind the improvement (APO vs. RLVR vs. their combination) is not isolated."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper tests on 81 APIs from three specific Python scientific libraries but makes broad claims about 'component-based API synthesis in large libraries' and 'complex API synthesis' generally. The title and abstract do not bound the claims to scientific Python libraries or to Gemini 2.0 specifically."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "No alternative explanations for the results are discussed. For example, the improvement could be due to overfitting the prompt or RLVR to the specific test suite format, or the baseline prompt could simply be suboptimal. None of these possibilities are considered."
    130       }
    131     },
    132     "setup_transparency": {
    133       "model_versions_specified": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper refers to 'a customized instance of Gemini 2.0' and to 'Gemini 2.5 Pro' but provides no specific model version identifiers, snapshot dates, or API version strings. 'Gemini 2.0' is a marketing name without a precise version."
    137       },
    138       "prompts_provided": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The manually engineered initial prompt is provided in full in Figure 2. It includes the actual prompt text with role conditioning, output format instructions, and template structure with placeholders whose fill values (method signature, module, library, test cases) are described."
    142       },
    143       "hyperparameters_reported": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Section 4.1 reports: 'sampling temperature is set to 0.7', 'input context window of 32,000 tokens', and 'output context window (maximum generation length) of 8,000 tokens'. RLVR uses 'stochastic decoding (temperature and top-p)' though specific RLVR hyperparameters (learning rate, epochs, K samples) are not fully specified."
    147       },
    148       "scaffolding_described": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "The APO pipeline is described in detail in Section 3.3 (discriminator score, text gradient, beam search), the RLVR pipeline in Section 3.4 (GRPO, binary rewards, clipped surrogate with KL penalty), and the test generation loop in Algorithm 1. The workflow is also shown in Figure 1."
    152       },
    153       "data_preprocessing_documented": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "The paper states that the benchmark comprises 81 tasks from three libraries, with 40 used for training, but does not document how these specific APIs were selected, what criteria were used for inclusion/exclusion, or how the training/evaluation split was determined."
    157       }
    158     },
    159     "limitations_and_scope": {
    160       "limitations_section_present": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The conclusion (Section 6) does not discuss limitations either."
    164       },
    165       "threats_to_validity_specific": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "No specific threats to validity are discussed anywhere in the paper. There is no discussion of potential issues with the evaluation methodology, benchmark selection, or generalizability."
    169       },
    170       "scope_boundaries_stated": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No explicit scope boundaries are stated. The paper does not clearly state what the results do NOT show or what settings are excluded. The only scoping statement is that the ablation study is not conducted 'due to time and space constraints.'"
    174       }
    175     },
    176     "data_integrity": {
    177       "raw_data_available": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No raw data (synthesized code outputs, test suite contents, per-task results) is made available for independent verification."
    181       },
    182       "data_collection_described": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The paper states that the benchmark comprises 81 tasks drawn from NumPy (36), scikit-learn (33), and SciPy (12), but does not describe how these specific APIs were selected from these large libraries, what criteria guided the selection, or why these particular counts were chosen."
    186       },
    187       "recruitment_methods_described": {
    188         "applies": false,
    189         "answer": false,
    190         "justification": "No human participants were involved; the study uses benchmark tasks from public Python libraries."
    191       },
    192       "data_pipeline_documented": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "The pipeline from API selection to test generation to synthesis evaluation is described at a high level, but specifics are missing: how the 40-task training set was selected from the 81 tasks, whether any tasks were excluded during development, and what the complete evaluation pipeline looked like end-to-end."
    196       }
    197     },
    198     "conflicts_of_interest": {
    199       "funding_disclosed": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "No funding information or acknowledgments section is present in the paper."
    203       },
    204       "affiliations_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Author affiliations are listed: all three authors are from The University of Texas at Austin. The paper evaluates Google's Gemini models, and the authors are not affiliated with Google based on the stated affiliations."
    208       },
    209       "funder_independent_of_outcome": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding is disclosed, so independence cannot be assessed. The paper heavily features Google's Gemini products (Gemini 2.0, Gemini 2.5 Pro, Gemini CLI) without disclosing whether Google provided any support (API credits, model access, etc.)."
    213       },
    214       "financial_interests_declared": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No competing interests or financial interests statement is present in the paper."
    218       }
    219     },
    220     "contamination": {
    221       "training_cutoff_stated": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "The paper uses Gemini 2.0 and Gemini 2.5 Pro to generate code for NumPy, SciPy, and scikit-learn APIs but does not state the training data cutoff dates for these models. The benchmark APIs exist in public libraries that are likely in the training data."
    225       },
    226       "train_test_overlap_discussed": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No discussion of whether the target APIs' implementations might already exist in the Gemini models' training data. Since the benchmark tasks are real existing APIs from well-known libraries, the reference implementations are publicly available and likely in the training corpus."
    230       },
    231       "benchmark_contamination_addressed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The benchmark uses real APIs from NumPy, scikit-learn, and SciPy — all widely used open-source libraries whose code is publicly available and almost certainly in Gemini's training data. This is a critical contamination concern that is not addressed."
    235       }
    236     },
    237     "human_studies": {
    238       "pre_registered": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants involved in this study."
    242       },
    243       "irb_or_ethics_approval": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants involved in this study."
    247       },
    248       "demographics_reported": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants involved in this study."
    252       },
    253       "inclusion_exclusion_criteria": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants involved in this study."
    257       },
    258       "randomization_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants involved in this study."
    262       },
    263       "blinding_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants involved in this study."
    267       },
    268       "attrition_reported": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants involved in this study."
    272       }
    273     },
    274     "cost_and_practicality": {
    275       "inference_cost_reported": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "No inference cost, API cost, tokens consumed, or wall-clock time is reported. The approach involves multiple LLM calls (APO iterations, RLVR training, test generation) but no cost information is provided."
    279       },
    280       "compute_budget_stated": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No computational budget is stated. RLVR fine-tuning and APO both require significant compute, but no GPU hours, API spend, or training time is reported."
    284       }
    285     }
    286   },
    287   "claims": [
    288     {
    289       "claim": "APRIL achieves a 93.8% success rate in synthesizing correct, test-passing APIs across 81 tasks.",
    290       "evidence": "Table 1 shows 76 out of 81 tasks pass validation tests: 35/36 NumPy, 30/33 scikit-learn, 11/12 SciPy (Section 4.2, RQ1).",
    291       "supported": "moderate"
    292     },
    293     {
    294       "claim": "APRIL substantially outperforms baseline LLMs using manually engineered prompts, with improvements of 16.6%, 14.9%, and 16.7% across three benchmarks.",
    295       "evidence": "Table 2 compares APRIL (93.8% overall) against baseline Gemini 2.0 with manual prompt (77.8% overall), showing improvements per benchmark (Section 4.2, RQ2).",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "All LLM-generated APIs built and executed successfully (100% executability).",
    300       "evidence": "Table 1 shows 81/81 executability across all benchmarks (Section 4.2, RQ1).",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "Gemini CLI generates on average 8.1 tests per API and requires 2.2 iterations to converge to a comprehensive test suite.",
    305       "evidence": "Table 3 reports test counts and iteration counts per benchmark (Section 4.2, RQ3).",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "Integrating APO and RLVR provides a robust, scalable path for component-based API synthesis.",
    310       "evidence": "Only the combined system is evaluated; no ablation study separates the contributions of APO and RLVR (Section 4.2). The 81-task benchmark is relatively small for 'scalable' claims.",
    311       "supported": "weak"
    312     }
    313   ],
    314   "methodology_tags": [
    315     "benchmark-eval"
    316   ],
    317   "key_findings": "APRIL combines Automatic Prompt Optimization and Reinforcement Learning from Verifiable Rewards to synthesize APIs from scientific Python libraries (NumPy, scikit-learn, SciPy). On 81 benchmark tasks, APRIL achieves a 93.8% test pass rate compared to 77.8% for a baseline using the same Gemini 2.0 model with a manually engineered prompt. The paper also demonstrates that Gemini CLI can generate comprehensive test suites in an average of 2.2 iterations per API. However, no ablation study separates the contributions of APO and RLVR, and significant contamination concerns exist since the benchmark APIs are from public libraries likely in the model's training data.",
    318   "red_flags": [
    319     {
    320       "flag": "Severe benchmark contamination risk",
    321       "detail": "The benchmark tasks are real APIs from NumPy, scikit-learn, and SciPy — all widely used open-source libraries whose complete source code is publicly available and almost certainly in Gemini's training data. The model may be recalling implementations rather than synthesizing them. This is never acknowledged or addressed."
    322     },
    323     {
    324       "flag": "No ablation study",
    325       "detail": "The paper explicitly acknowledges it does not isolate the contributions of its two main components (APO and RLVR). Without this, the claimed synergy cannot be verified — all gains could come from one component alone."
    326     },
    327     {
    328       "flag": "Single baseline comparison",
    329       "detail": "The only baseline is the same Gemini 2.0 model with a manually engineered prompt. No comparison against other LLMs, other code generation tools, or prior program synthesis systems (SyPet, FrAngel, EdSynth) discussed in related work."
    330     },
    331     {
    332       "flag": "No statistical tests or uncertainty quantification",
    333       "detail": "Results appear to be single-run point estimates: the paper does not say whether experiments were repeated, and it reports no confidence intervals, significance tests, or variance across runs. On a small benchmark (81 tasks, 12 for SciPy), sampling variation could be substantial."
    334     },
    335     {
    336       "flag": "No limitations section",
    337       "detail": "The paper contains no limitations section, no threats to validity, and no scope boundaries. This is a significant methodological omission."
    338     },
    339     {
    340       "flag": "Unclear train/test separation",
    341       "detail": "40 of 81 tasks are used for APO/RLVR training, but it is not clear whether the reported 93.8% pass rate is on held-out tasks only or includes training tasks. If training tasks are included, the reported performance is inflated."
    342     },
    343     {
    344       "flag": "Undisclosed relationship with Google products",
    345       "detail": "The paper heavily features Google's Gemini ecosystem (Gemini 2.0, Gemini 2.5 Pro, Gemini CLI) without disclosing whether Google provided any support, API credits, or early access. No funding or competing interests statement is present."
    346     }
    347   ],
    348   "cited_papers": [
    349     {
    350       "title": "Automatic prompt optimization with 'gradient descent' and beam search",
    351       "authors": ["R. Pryzant", "D. Iter", "J. Li", "Y. Lee", "C. Zhu", "M. Zeng"],
    352       "year": 2023,
    353       "relevance": "Foundational APO method used in APRIL; relevant to understanding LLM prompt optimization techniques."
    354     },
    355     {
    356       "title": "Tulu 3: Pushing frontiers in open language model post-training",
    357       "authors": ["N. Lambert", "J. Morrison", "V. Pyatkin"],
    358       "year": 2025,
    359       "relevance": "Describes RLVR methodology adopted in APRIL for fine-tuning LLMs with verifiable rewards."
    360     },
    361     {
    362       "title": "Jigsaw: Large language models meet program synthesis",
    363       "authors": ["N. Jain", "S. Vaidyanath", "A. Iyer", "N. Natarajan", "S. Parthasarathy", "S. Rajamani", "R. Sharma"],
    364       "year": 2022,
    365       "relevance": "Prior work on using LLMs for program synthesis; directly related to LLM-based code generation evaluation."
    366     },
    367     {
    368       "title": "Quality and trust in LLM-generated code",
    369       "authors": ["C. Spiess", "D. Gros", "K. S. Pai", "M. Pradel"],
    370       "year": 2024,
    371       "arxiv_id": "2402.02047",
    372       "relevance": "Studies quality and trust issues in LLM-generated code, directly relevant to evaluating code generation reliability."
    373     },
    374     {
    375       "title": "Are human rules necessary? Generating reusable APIs with CoT reasoning and in-context learning",
    376       "authors": ["Y. Mai", "Z. Gao", "X. Hu", "L. Bao", "Y. Liu", "J. Sun"],
    377       "year": 2024,
    378       "relevance": "Prior work on using LLMs to generate reusable APIs, directly comparable to APRIL's approach."
    379     },
    380     {
    381       "title": "An approach for API synthesis using large language models",
    382       "authors": ["H. Zhong", "S. Jiang", "S. Khurshid"],
    383       "year": 2025,
    384       "arxiv_id": "2502.15246",
    385       "relevance": "Predecessor work by the same authors on LLM-based API synthesis."
    386     },
    387     {
    388       "title": "Can large language models transform natural language intent into formal method postconditions?",
    389       "authors": ["M. Endres", "S. Fakhoury", "S. Chakraborty", "S. K. Lahiri"],
    390       "year": 2024,
    391       "relevance": "Evaluates LLM capabilities for formal specification generation, relevant to code generation evaluation methodology."
    392     },
    393     {
    394       "title": "Fuzz4all: Universal fuzzing with large language models",
    395       "authors": ["C. S. Xia", "M. Paltenghi", "J. Le Tian", "M. Pradel", "L. Zhang"],
    396       "year": 2024,
    397       "relevance": "Uses LLMs for test generation (fuzzing), relevant to LLM-based testing and code quality evaluation."
    398     },
    399     {
    400       "title": "On the effectiveness of large language models in writing Alloy formulas",
    401       "authors": ["Y. Hong", "S. Jiang", "Y. Fu", "S. Khurshid"],
    402       "year": 2025,
    403       "arxiv_id": "2502.15441",
    404       "relevance": "Evaluates LLM effectiveness for formal language generation, relevant to LLM code generation capability assessment."
    405     },
    406     {
    407       "title": "CASCADE: LLM-powered JavaScript deobfuscator at Google",
    408       "authors": ["S. Jiang", "P. Kovuri", "D. Tao", "Z. Tan"],
    409       "year": 2025,
    410       "arxiv_id": "2507.17691",
    411       "relevance": "Industrial application of LLMs for code transformation at Google, relevant to practical LLM-based SE tools."
    412     },
    413     {
    414       "title": "Generating executable oracles to check conformance of client code to requirements of JDK Javadocs using LLMs",
    415       "authors": ["S. Jiang", "C. Zhu", "S. Khurshid"],
    416       "year": 2024,
    417       "arxiv_id": "2411.01789",
    418       "relevance": "Uses LLMs for test oracle generation from documentation, directly related to the test generation component of APRIL."
    419     }
    420   ]
    421 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs