ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (22025B)


      1 {
      2   "paper": {
      3     "title": "AI for NONMEM Coding in Pharmacometrics Research and Education: Shortcut or Pitfall?",
      4     "authors": [
      5       "Wenhao Zheng",
      6       "Wanbing Wang",
      7       "Carl M.J. Kirkpatrick",
      8       "Cornelia B. Landersdorfer",
      9       "Huaxiu Yao",
     10       "Jiawei Zhou"
     11     ],
     12     "year": 2025,
     13     "venue": "CPT: Pharmacometrics & Systems Pharmacology",
     14     "arxiv_id": "2507.08144",
     15     "doi": "10.1002/psp4.70125"
     16   },
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper provides a GitHub link: https://github.com/zhoujw14/AI_PMx stated as 'Supplementary Materials can be found https://github.com/zhoujw14/AI_PMx'."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The 13 NONMEM coding tasks (prompts) and all AI agent responses are provided in the Supplementary Materials via the GitHub repository. The tasks themselves constitute the evaluation data."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No environment or dependency specifications are provided. The paper does not describe API versions, library versions, or any reproducibility setup beyond naming the models used."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No step-by-step reproduction instructions are provided. The supplementary materials contain prompts and responses but there is no README or instructions describing how to replicate the evaluation."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper reports median and IQR for scores across 13 tasks but does not provide confidence intervals or error bars. IQR describes data spread but is not a measure of uncertainty around an estimate."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper claims o1 'demonstrated the best overall performance' and that the optimized prompt 'enhanced coding accuracy across all AI agents' but provides no statistical significance tests to support these comparative claims."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No effect sizes are reported. The paper compares median scores but does not provide standardized effect sizes, or percentage improvements relative to the original-prompt baseline, for the optimized prompt."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The sample of 13 tasks is not justified. There is no discussion of whether 13 tasks is sufficient to draw reliable conclusions about AI agent performance, no power analysis, and no acknowledgment that the small number limits statistical inference."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No variance or standard deviation is reported. The paper reports medians and IQR across the 13 tasks but these reflect task-level variation in a single run per model-task pair. Each model-task combination appears to be run once with no repeated runs, so there is no measure of run-to-run variance."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Seven AI agents are compared against each other, and the original prompt serves as a baseline for the optimized prompt condition. Prior work (refs 5-7) evaluating ChatGPT and Gemini is referenced as comparison context."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "The models evaluated (gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, gpt-4o, gpt-4o-mini, o1, o3-mini) are all recent OpenAI models as of 2025."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "The optimized prompt is compared to the original prompt, but there is no ablation study examining which components of the optimized prompt contribute to improvement. The prompt embeds multiple rubric criteria and no individual component analysis is performed."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "Only a single metric is used: the 3-level scoring rubric with a maximum of 6 points. No additional metrics such as syntactic validity, compilation success rate, or execution success are reported."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "The scoring rubric is applied through expert evaluation. The rubric was 'refined and validated through expert consensus' and the codes are scored by domain experts against the rubric criteria."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "There is no separation of tasks into development and test sets. The same 13 tasks are used both for developing the rubric and evaluating final performance. The rubric was 'developed iteratively using several preliminary test tasks' but the final evaluation uses all 13 tasks without a held-out split."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Figure 2 shows per-task performance for each AI agent comparing original vs. optimized prompt. Individual task scores are available, not just aggregated averages."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The Impact Assessment section discusses specific failure modes: 'AI agents may fail to implement differential equations correctly or misapply model compartments. They may also struggle with proper initialization of model parameters or aligning dosing/event records in the dataset.'"
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper reports that several models performed poorly (e.g., gpt-4.1-nano and gpt-4o-mini had lower scores), and notes limitations of AI agents for complex models. The paper honestly states that 'current AI agents show limitations' for complex modeling tasks."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract claims o1 and gpt-4.1 achieved the best performance and perfect scores with the optimized prompt. These claims are supported by the results in Figure 1 and the reported scores (median 5, IQR 2.5-6 for o1; perfect 6.0 with optimized prompt for both)."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper makes causal claims about the optimized prompt improving performance ('significantly improved code accuracy', 'enhanced coding accuracy across all AI agents'). However, the study design does not control for confounds such as prompt length, specificity, or order effects, and each condition is run only once per model-task pair."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The title and abstract frame the work broadly ('AI for NONMEM Coding') but only 7 OpenAI models are tested. No non-OpenAI models (Claude, Gemini, Llama, etc.) are evaluated despite being commonly available. The paper does not explicitly bound its conclusions to OpenAI models only."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "No alternative explanations are discussed for the observed performance differences. For example, the improvement from the optimized prompt could be due to prompt length, added domain context, or specific formatting rather than the rubric criteria. No such alternatives are considered."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper lists model marketing names (gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, gpt-4o, gpt-4o-mini, o1, o3-mini) but does not specify exact API versions, snapshot dates, or version identifiers (e.g., 'gpt-4.1-2025-04-14'). Marketing names without snapshot dates do not count as specified versions."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper states that 'Responses generated by all seven AI agents, as well as their responses using the optimized prompt, are provided in the Supplementary Materials' at the GitHub repository. The optimized prompt is explicitly said to be 'provided in the Supplementary Materials.'"
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "No hyperparameters such as temperature, top-p, max tokens, or other API settings are reported. These significantly affect LLM output quality."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used. The paper sends single prompts to AI models and evaluates the responses directly."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "The 13 tasks are clearly enumerated in Table 1 with exact task descriptions. The scoring rubric development process is described: 'developed iteratively using several preliminary test tasks and was informed by common AI-generated NONMEM coding errors... refined and validated through expert consensus.'"
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "There is no dedicated limitations or threats-to-validity section. The paper acknowledges some limitations of AI agents in the Impact Assessment section but does not discuss the study's own methodological limitations."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "No specific threats to validity of this study are discussed. The paper discusses limitations of AI agents themselves but not limitations of the evaluation methodology (e.g., small task set, single vendor, no inter-rater reliability for rubric scoring, single-run evaluation)."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound its conclusions to OpenAI models, does not acknowledge that 13 tasks may be insufficient for generalizable claims, and does not state that results may not transfer to other LLM providers."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The GitHub repository (https://github.com/zhoujw14/AI_PMx) contains the AI-generated responses for all tasks, which constitutes the raw data of this study."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "The data collection procedure is described: 13 coding tasks were defined (Table 1), each sent to 7 AI agents, with both original and optimized prompts. The scoring rubric (Table 2) is provided."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants were recruited. The study evaluates AI model outputs on defined tasks."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "The pipeline from collecting AI responses to scoring them is not fully documented. It is unclear who scored the responses, whether multiple scorers were used, or how disagreements (if any) were resolved. The scoring process lacks transparency."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The Acknowledgements section states: 'This work is funded by a PharmAlliance Early Career Research Award and University of North Carolina at Chapel Hill.'"
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly listed. The authors are from UNC Chapel Hill and Monash University. They are evaluating OpenAI products and have no apparent affiliation with OpenAI."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "The funders (PharmAlliance and UNC Chapel Hill) are academic institutions with no apparent financial interest in whether OpenAI models perform well at NONMEM coding."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "The paper explicitly states: 'The authors report no conflicts of interest.'"
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper does not state the training data cutoff dates for any of the 7 models evaluated. This is relevant because NONMEM coding examples may exist in the training data."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No discussion of whether the 13 NONMEM coding tasks or similar tasks appeared in the models' training data. NONMEM coding examples are publicly available online and could be in the training sets."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No discussion of benchmark contamination. The NONMEM coding patterns tested are standard pharmacometrics tasks that likely exist in training corpora, but this is not addressed."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study. The study evaluates AI model outputs on coding tasks."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference cost, API costs, or latency are reported despite calling 7 different OpenAI models across 13 tasks with two prompt conditions (182 API calls minimum)."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No computational budget or total API spend is stated."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "OpenAI o1 demonstrated the best overall performance among seven AI agents, with a median score of 5 (IQR 2.5-6) using the original prompt and a perfect 6.0 on all 13 tasks with the optimized prompt.",
    294       "evidence": "Results section and Figure 1 show o1's median and IQR scores. With the optimized prompt, o1 achieved 6.0 on all 13 tasks.",
    295       "supported": "moderate"
    296     },
    297     {
    298       "claim": "gpt-4.1 ranked second, scoring a median of 5 (IQR 2.5-5) with the original prompt and also achieving a perfect 6.0 on all 13 tasks with the optimized prompt.",
    299       "evidence": "Results section and Figure 1. gpt-4.1 matched o1's perfect performance with the optimized prompt.",
    300       "supported": "moderate"
    301     },
    302     {
    303       "claim": "The optimized prompt enhanced coding accuracy across all AI agents, with greater improvement seen in more complex models.",
    304       "evidence": "Results section and Figure 2 show task-specific improvements. Indirect response and PK models with absorption lag time showed greater improvement.",
    305       "supported": "weak"
    306     },
    307     {
    308       "claim": "AI agents performed well in writing basic NONMEM model structures, providing a useful foundation for pharmacometrics model coding.",
    309       "evidence": "Results section: high scores on simpler tasks (e.g., one-compartment models). Impact Assessment acknowledges limitations for complex models.",
    310       "supported": "moderate"
    311     }
    312   ],
    313   "methodology_tags": [
    314     "benchmark-eval"
    315   ],
    316   "key_findings": "The study benchmarked 7 OpenAI models on 13 NONMEM pharmacometrics coding tasks using a 3-level scoring rubric (max 6 points). OpenAI o1 and gpt-4.1 achieved the best performance, both reaching perfect scores on all tasks when given an optimized prompt embedding the rubric criteria. AI agents performed well on basic model structures but struggled with complex models involving differential equations, parameter initialization, and dataset alignment. The paper argues AI can support pharmacometrics education but warns against overreliance without expert oversight.",
    317   "red_flags": [
    318     {
    319       "flag": "Single vendor evaluation",
    320       "detail": "All 7 models evaluated are from OpenAI. Despite the broad framing of 'AI for NONMEM Coding', no models from Anthropic, Google, Meta, or other providers are tested, severely limiting generalizability claims."
    321     },
    322     {
    323       "flag": "No inter-rater reliability for rubric scoring",
    324       "detail": "The scoring rubric is applied by experts but there is no mention of multiple scorers, inter-rater agreement, or blinding. It is unclear whether a single person scored all outputs or whether consistency was verified."
    325     },
    326     {
    327       "flag": "Single-run evaluation per model-task pair",
    328       "detail": "Each model appears to be run once per task. LLM outputs are stochastic and can vary significantly across runs. No repeated sampling or temperature control is mentioned."
    329     },
    330     {
    331       "flag": "Very small task set",
    332       "detail": "Only 13 coding tasks are used to evaluate performance. This is too few for robust statistical conclusions about model ranking, especially given that all results rely on this small sample."
    333     },
    334     {
    335       "flag": "No statistical significance testing",
    336       "detail": "Claims about which model is 'best' and that the optimized prompt 'significantly improved' accuracy are made without any statistical tests on a sample of 13 tasks."
    337     },
    338     {
    339       "flag": "Missing hyperparameter reporting",
    340       "detail": "Temperature, top-p, and other sampling parameters are not reported. These critically affect output quality and reproducibility. Without this information, results cannot be reliably replicated."
    341     }
    342   ],
    343   "cited_papers": [
    344     {
    345       "title": "On the trustworthiness of generative foundation models: Guideline, assessment, and perspective",
    346       "authors": ["Y. Huang"],
    347       "year": 2025,
    348       "arxiv_id": "2502.14296",
    349       "relevance": "Provides guidelines for assessing trustworthiness of generative AI models, directly relevant to AI evaluation methodology."
    350     },
    351     {
    352       "title": "Agents for Change: Artificial Intelligent Workflows for Quantitative Clinical Pharmacology and Translational Sciences",
    353       "authors": ["M.H. Shahin", "S. Goswami", "S. Lobentanzer", "B.W. Corrigan"],
    354       "year": 2025,
    355       "relevance": "Discusses AI agent workflows in pharmacology, relevant to agentic AI applications in specialized scientific domains."
    356     },
    357     {
    358       "title": "Leveraging large language models in pharmacometrics: evaluation of NONMEM output interpretation and simulation capabilities",
    359       "authors": ["H.J. Cha", "K. Choe", "E. Shin", "M. Ramanathan", "S. Han"],
    360       "year": 2025,
    361       "relevance": "Directly evaluates LLMs for pharmacometrics tasks including NONMEM, a closely related benchmark study."
    362     },
    363     {
    364       "title": "ChatGPT in pharmacometrics? Potential opportunities and limitations",
    365       "authors": ["M.E. Cloesmeijer", "A. Janssen", "S.F. Koopman", "M.H. Cnossen", "R.A. Math\u00f4t"],
    366       "year": 2024,
    367       "relevance": "Early evaluation of ChatGPT for pharmacometrics, establishing baseline for LLM coding capability in this domain."
    368     },
    369     {
    370       "title": "Evaluation of ChatGPT and Gemini large language models for pharmacometrics with NONMEM",
    371       "authors": ["E. Shin", "Y. Yu", "R.R. Bies", "M. Ramanathan"],
    372       "year": 2024,
    373       "relevance": "Evaluates ChatGPT and Gemini for NONMEM coding, directly comparable benchmark study with different models."
    374     },
    375     {
    376       "title": "Check My Work?: Measuring Sycophancy in a Simulated Educational Context",
    377       "authors": ["C. Arvin"],
    378       "year": 2025,
    379       "arxiv_id": "2506.10297",
    380       "relevance": "Relevant to the paper's discussion of AI in education and the risks of sycophantic AI behavior in learning contexts."
    381     }
    382   ]
    383 }

Impressum · Datenschutz