scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25395B)
      1 {
      2   "paper": {
      3     "title": "CodePDE: An Inference Framework for LLM-driven PDE Solver Generation",
      4     "authors": [
      5       "Shanda Li",
      6       "Tanya Marwah",
      7       "Junhong Shen",
      8       "Weiwei Sun",
      9       "Andrej Risteski",
     10       "Yiming Yang",
     11       "Ameet Talwalkar"
     12     ],
     13     "year": 2026,
     14     "venue": "Transactions on Machine Learning Research",
     15     "arxiv_id": "2505.08783"
     16   },
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper provides a GitHub link (https://github.com/LithiumDA/CodePDE) in Section 3 and the abstract. Evaluation data is released on HuggingFace (https://huggingface.co/datasets/LDA1020/codepde-data/tree/main)."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Datasets are drawn from PDEBench (Takamoto et al., 2022) and the FNO paper (Li et al., 2022b), both released under the MIT license. The evaluation data is also released on HuggingFace."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper mentions the framework is 'implemented in Python' and that evaluation uses an NVIDIA GeForce RTX 2080 Ti GPU (Appendix B.3), but no requirements.txt, Dockerfile, or detailed dependency specifications are provided in the paper itself."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No step-by-step reproduction instructions are provided in the paper. While code is released on GitHub, the paper itself does not contain a 'Reproducing Results' section or specific commands to replicate the experiments."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "Main results in Tables 1, 3, and 4 report only point estimates of nRMSE. No confidence intervals or error bars are provided for the primary accuracy metrics. The test-time scaling curves in Figure 3 appear smoothed but do not show uncertainty bands."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper makes numerous comparative claims (e.g., 'CodePDE with refinement outperforms the hand-crafted reference solvers on 4 out of 5 evaluated tasks') but provides no statistical significance tests. Differences are assessed by direct numerical comparison only."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "The paper reports effect sizes in context, e.g., 'Gemini 2.0 Flash achieves an nRMSE of 1.06×10^-4 via refinement, compared to the reference's 3.55×10^-4, representing a 70% improvement in accuracy' (Section 5.1). Bug-free rates are given with baseline context (41% to 84%)."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The paper uses 100 instances per PDE family and 32 i.i.d. samples per model per PDE for code generation (Appendix B.4), but provides no justification for why these numbers are sufficient or any power analysis."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Results are reported as best-of-n samples (best of 32 or best of 12) rather than as averages with standard deviations. No variance or standard deviation across runs is reported for the main nRMSE results."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The paper compares against reference numerical solvers, PDE-solving software packages (Dedalus, PyPDE), neural network baselines (U-Net, FNO, PINN, ORCA, PDEformer, UPS), and agentic workflows (FunSearch, AIDE). See Table 1."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Baselines include contemporary models and methods: UPS (2024b), ORCA (2023), PDEformer (2024), FunSearch (2024), and AIDE (2025). The LLM baselines span 16 models including the latest reasoning models (o3, DeepSeek-R1, Gemini 2.5 Pro)."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Table 2 presents an ablation study removing components progressively: CodePDE full → w/o refine → w/o refine+scale → w/o refine+scale+debug. Each component's contribution is quantified."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The paper uses four metrics: nRMSE (primary accuracy), debug success rate, convergence rate (with order classification), and execution time. See Section 4 (Evaluation Metrics)."
     87       },
     88       "human_evaluation": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "Human evaluation is not applicable here. The paper evaluates automatically generated PDE solvers against ground truth numerical solutions. The quality of solvers is objectively measurable via nRMSE against reference solutions."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Appendix B.3 explicitly states: 'We randomly sample 100 instances for each family for testing. We also sample another 50 instances as the development set, which is used to provide the execution feedback to LLMs without leaking the true test data.'"
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Results are broken down by PDE family (Advection, Burgers, Reaction-Diffusion, CNS, Darcy) and by model in Tables 1, 3, 4. Bug-free rates are broken down per model and per PDE in Figure 2 and Tables 5-6."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 5.7 provides detailed failure case analysis on the Reaction-Diffusion Equation, explaining why LLM-generated solvers fail (they use finite-difference for the reaction term instead of the analytical solution). Table 9 and Appendix D provide code-level comparisons."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper reports that all LLMs fail to match the reference solver on Reaction-Diffusion (Section 5.1, 5.7). It also notes that reasoning models are 'not necessarily better than standard ones in the refinement stage' (Section 5.3), an unexpected negative finding."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract claims LLMs 'can achieve strong performance across a range of representative PDE problems' with advanced inference strategies. Table 1 supports this, showing CodePDE outperforms reference solvers on 4/5 tasks. The abstract also mentions 'trade-offs between solver reliability and sophistication' which is supported by the convergence analysis in Figure 4."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The paper makes causal claims through ablation studies (Table 2), which use controlled single-variable manipulation. Removing components one at a time, cumulatively (refinement, then scaling, then debugging), demonstrates their causal contribution to performance. The ablation design is adequate."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The title 'LLM-driven PDE Solver Generation' and abstract claims about 'PDE solving' are broader than what is tested. The paper evaluates only 5 PDE families (all standard benchmarks), but claims like 'LLMs can generate effective and efficient solver code for PDEs' are stated broadly. The conclusions refer to 'scientific computing' generally."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper does not discuss alternative explanations for its results. For example, the best-of-n selection strategy could mean the improvements are driven by selection pressure rather than LLM capability. The role of training data contamination (LLMs may have seen PDE solver code during training) is not discussed as an alternative explanation."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper refers to models by marketing names: 'GPT-4o', 'Claude-3.7-sonnet', 'Gemini 2.5 Pro', 'o3', etc. No API version, snapshot date, or specific model ID (e.g., 'gpt-4o-2024-05-13') is provided. Appendix Table 3 references models by the same marketing names."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Full prompts are provided in Appendix B: the system prompt (B.2), PDE descriptions used as task specifications (B.1), debugging prompt template with {code_output} and {error_message} placeholders, and refinement prompt template. The actual PDE descriptions that fill the templates are fully specified."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Appendix B.4 reports decoding temperature (0.7 for most models, 1.0 for OpenAI o-series), sample count (32 for generation, 12 for refinement), debug iteration limit (4), and FunSearch/AIDE-specific parameters (island count, reset period, search steps)."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The CodePDE framework is described in detail in Section 3 with a 5-step pipeline: task specification, code generation, debugging (with iterative error feedback), evaluation, and solver refinement. The agentic workflow is fully described with prompts in Appendix B."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Appendix B.3 documents data handling: 100 instances sampled per PDE family for testing, 50 for development. Data sources (PDEBench, FNO paper) are specified with licenses. The separation between dev and test sets is clearly documented."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. The Conclusions section (Section 6) is brief and does not discuss limitations substantively."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "No specific threats to validity are discussed. The paper does not address issues like: whether LLMs may have seen PDE solver code in training data, whether the 5 chosen PDEs are representative, or whether best-of-n selection inflates apparent capability."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound its claims to the specific PDEs tested, the specific hardware used, or the particular model versions evaluated. The conclusions make broad claims about 'LLMs and agents for scientific computing' without scoping."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Evaluation data is released on HuggingFace (https://huggingface.co/datasets/LDA1020/codepde-data/tree/main) and code on GitHub. The underlying PDE benchmark datasets (PDEBench, FNO) are publicly available."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 4 and Appendix B.3 describe data collection: datasets are drawn from PDEBench and the FNO paper (both publicly available under the MIT license), 100 instances randomly sampled per family for testing, 50 for development."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. Data source is standard public benchmarks (PDEBench, FNO)."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The pipeline is documented: PDE task specification → LLM code generation (32 samples) → debugging (up to 4 rounds) → evaluation against reference solutions → optional refinement (12 samples from top 5 seeds). Appendix B.3-B.4 provide full details."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The Acknowledgments section states: 'This work was supported in part by the National Science Foundation grants IIS1705121, IIS1838017, IIS2046613, IIS2112471, and funding from Datadog.'"
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are listed: Carnegie Mellon University, Flatiron Institute, Polymathic AI, and Datadog (for Ameet Talwalkar). The paper evaluates multiple LLMs from various providers without the authors being affiliated with those providers."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Funding comes from NSF grants and Datadog. Neither funder has a direct stake in whether LLMs can generate PDE solvers. The paper evaluates multiple third-party LLMs without favoring any particular provider."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is present in the paper. Ameet Talwalkar is affiliated with Datadog, which provides funding, but no competing interests statement is included."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper evaluates 16 LLMs on code generation benchmarks but does not state the training data cutoff for any of the models. This is important because LLMs may have seen PDE solver code (including from PDEBench implementations) during training."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No discussion of whether the LLMs may have seen PDE solver implementations from PDEBench or similar public codebases during training. This is a significant concern since PDEBench code is publicly available on GitHub."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "PDEBench was published in 2022 and its code is publicly available. All evaluated models were trained after 2022 and likely had access to PDE solver implementations. This contamination risk is not discussed."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": true,
    282         "justification": "Appendix A.7 reports token counts across PDE benchmarks for generation, debugging, and refinement stages. It states 'solving a single PDE using the full CodePDE framework generally incurs a total cost of less than $5 USD when using API-based services.' Execution times are also reported in Appendix A.5."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "The paper reports hardware (NVIDIA GeForce RTX 2080 Ti GPU, 11GB memory) in Appendix B.3, execution time analysis in Appendix A.5 (Figure 11), and token usage analysis in Appendix A.7 (Figures 14-16). Runtime limits (1200s and 2400s) are specified."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "CodePDE with refinement outperforms hand-crafted reference solvers on 4 out of 5 PDE families.",
    294       "evidence": "Table 1 shows that after refinement, LLM-generated solvers achieve lower nRMSE than reference solvers on Advection, Burgers, CNS, and Darcy Flow. Only Reaction-Diffusion remains worse.",
    295       "supported": "strong"
    296     },
    297     {
    298       "claim": "Self-debugging raises the average bug-free rate from 41% to 84% across all PDEs.",
    299       "evidence": "Figure 2 and Tables 5-6 in the appendix provide detailed bug-free rates before and after debugging for each model and PDE family. The average across 11 LLMs goes from approximately 41% to 84%.",
    300       "supported": "strong"
    301     },
    302     {
    303       "claim": "Test-time scaling (best-of-n) improves solution quality, with most significant gains between n=4 and n=16.",
    304       "evidence": "Figure 3 shows smoothed test-time scaling curves for representative models across PDE families. The curves show diminishing returns after n=16.",
    305       "supported": "moderate"
    306     },
    307     {
    308       "claim": "Advanced reasoning models excel at generation but are not necessarily better at refinement than standard models.",
    309       "evidence": "Table 1 shows o3 and DeepSeek-R1 lead in 'Reasoning + Debugging' but GPT-4o and DeepSeek-V3 are competitive or better in 'Reasoning + Debugging + Refinement'. Section 5.3 discusses this finding.",
    310       "supported": "moderate"
    311     },
    312     {
    313       "claim": "The combination of debugging, refinement, and test-time scaling is essential; without them, naive prompting yields nRMSE of 1.49×10^-1.",
    314       "evidence": "Table 2 ablation study with Claude-3.7-sonnet shows progressive degradation: full CodePDE (4.44×10^-3) → w/o refine (5.15×10^-3) → w/o refine+scale (8.68×10^-2) → w/o all (1.49×10^-1).",
    315       "supported": "strong"
    316     },
    317     {
    318       "claim": "LLMs consistently fail on Reaction-Diffusion because they use finite-difference for the reaction term instead of the analytical solution.",
    319       "evidence": "Section 5.7 and Table 9 in Appendix D compare LLM-generated solvers against the reference, showing LLMs discretize the reaction term while the reference uses the analytical solution.",
    320       "supported": "strong"
    321     }
    322   ],
    323   "methodology_tags": [
    324     "benchmark-eval"
    325   ],
    326   "key_findings": "CodePDE demonstrates that LLMs paired with structured inference frameworks (debugging, refinement, test-time scaling) can generate PDE solvers competitive with hand-crafted numerical implementations on 4 of 5 benchmark PDE families. Self-debugging raises bug-free rates from 41% to 84%, and best-of-n sampling shows clear test-time scaling benefits. The paper identifies a reliability-sophistication trade-off where some models favor simple robust methods while others explore diverse higher-order schemes, and reveals that code generation and code refinement may be distinct LLM skills.",
    327   "red_flags": [
    328     {
    329       "flag": "Best-of-n selection inflates reported performance",
    330       "detail": "Main results use best-of-32 (generation) and best-of-12 (refinement) selection. This reports the performance ceiling rather than expected performance. No average or median nRMSE is reported, making it difficult to assess typical performance a practitioner would experience."
    331     },
    332     {
    333       "flag": "No uncertainty quantification on main results",
    334       "detail": "All nRMSE values in Tables 1, 3, 4 are point estimates without confidence intervals, standard deviations, or error bars. With stochastic sampling (temperature > 0), results will vary across runs, but this variance is never reported."
    335     },
    336     {
    337       "flag": "Training data contamination not addressed",
    338       "detail": "PDEBench code and PDE solver implementations are publicly available on GitHub. All 16 evaluated LLMs were trained after PDEBench's 2022 release. The LLMs may have memorized PDE solver patterns from training data, which would inflate performance relative to truly novel scientific problems."
    339     },
    340     {
    341       "flag": "No limitations section",
    342       "detail": "The paper lacks a limitations or threats-to-validity section. Significant concerns (contamination, generalization to harder PDEs, best-of-n inflation, cost at scale) are not discussed."
    343     },
    344     {
    345       "flag": "Overgeneralized conclusions",
    346       "detail": "The conclusions claim insights for 'developing more capable LLMs and agents for scientific computing' based on only 5 standard PDE families. These are well-studied equations with known numerical methods; the results may not extend to novel or complex multi-physics problems."
    347     }
    348   ],
    349   "cited_papers": [
    350     {
    351       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    352       "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"],
    353       "year": 2023,
    354       "relevance": "Key inference-time technique for iterative code improvement used in CodePDE's refinement stage."
    355     },
    356     {
    357       "title": "Teaching Large Language Models to Self-Debug",
    358       "authors": ["Xinyun Chen", "Maxwell Lin", "Nathanael Schärli", "Denny Zhou"],
    359       "year": 2024,
    360       "relevance": "Foundational work on LLM self-debugging capability, a core component of CodePDE's debugging pipeline."
    361     },
    362     {
    363       "title": "Scaling LLM Test-Time Compute Optimally Can Be More Effective Than Scaling Parameters for Reasoning",
    364       "authors": ["Charlie Victor Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"],
    365       "year": 2025,
    366       "relevance": "Motivates the test-time scaling strategy (best-of-n sampling) used in CodePDE's evaluation framework."
    367     },
    368     {
    369       "title": "Mathematical Discoveries from Program Search with Large Language Models",
    370       "authors": ["Bernardino Romera-Paredes"],
    371       "year": 2024,
    372       "relevance": "FunSearch is used as a baseline agentic workflow for comparison with CodePDE."
    373     },
    374     {
    375       "title": "AIDE: AI-Driven Exploration in the Space of Code",
    376       "authors": ["Zhengyao Jiang", "Dominik Schmidt"],
    377       "year": 2025,
    378       "relevance": "AIDE is used as a baseline agentic workflow featuring tree search for code generation comparison."
    379     },
    380     {
    381       "title": "MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering",
    382       "authors": ["Jun Shern Chan", "Neil Chowdhury", "Oliver Jaffe"],
    383       "year": 2025,
    384       "relevance": "Benchmark for evaluating LLM agents on ML engineering tasks, related to code generation capability evaluation."
    385     },
    386     {
    387       "title": "Large Language Monkeys: Scaling Inference Compute with Repeated Sampling",
    388       "authors": ["Bradley Brown", "Jordan Juravsky", "Ryan Ehrlich"],
    389       "year": 2024,
    390       "relevance": "Prior work on repeated sampling as a test-time compute scaling strategy for LLMs."
    391     },
    392     {
    393       "title": "From Decoding to Meta-Generation: Inference-time Algorithms for Large Language Models",
    394       "authors": ["Sean Welleck", "Amanda Bertsch", "Matthew Finlayson"],
    395       "year": 2024,
    396       "relevance": "Survey of inference-time algorithms for LLMs that motivates CodePDE's framework design."
    397     },
    398     {
    399       "title": "Terminal-Bench: Benchmarking Agents on Hard, Realistic Tasks in Command Line Interfaces",
    400       "authors": ["Mike A Merrill", "Alexander Glenn Shaw", "Nicholas Carlini"],
    401       "year": 2026,
    402       "relevance": "CodePDE has been integrated into Terminal-Bench via Harbor Adapters, demonstrating practical deployment of the framework."
    403     },
    404     {
    405       "title": "PDEBench: An Extensive Benchmark for Scientific Machine Learning",
    406       "authors": ["Makoto Takamoto", "Timothy Praditia"],
    407       "year": 2022,
    408       "relevance": "Primary benchmark dataset source for CodePDE's evaluation; defines nRMSE metric used throughout."
    409     },
    410     {
    411       "title": "Inference Scaling Laws: An Empirical Analysis of Compute-Optimal Inference for LLM Problem-Solving",
    412       "authors": ["Yangzhen Wu", "Zhiqing Sun", "Shanda Li"],
    413       "year": 2025,
    414       "relevance": "Empirical analysis of inference compute scaling that informs CodePDE's test-time scaling strategy."
    415     }
    416   ]
    417 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs