scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28262B)
      1 {
      2   "paper": {
      3     "title": "ASTROVISBENCH: A Code Benchmark for Scientific Computing and Visualization in Astronomy",
      4     "authors": [
      5       "Sebastian Joseph",
      6       "Syed Murtaza Husain",
      7       "Stella S. R. Offner",
      8       "Stéphanie Juneau",
      9       "Paul Torrey",
     10       "Adam S. Bolton",
     11       "Juan P. Farias",
     12       "Niall Gaffney",
     13       "Greg Durrett",
     14       "Junyi Jessy Li"
     15     ],
     16     "year": 2025,
     17     "venue": "arXiv preprint",
     18     "arxiv_id": "2505.20538"
     19   },
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper states 'We release the code and data for ASTROVISBENCH at astrovisbench.github.io' in the abstract, and provides the URL https://astrovisbench.github.io/ at the top of the paper."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The benchmark data (864 tasks from 110 Jupyter notebooks) is released at the same URL. The paper says 'We release the code and data for ASTROVISBENCH.' The source notebooks are from publicly available collections (Astro Data Lab, STScI)."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Appendix F mentions the hardware used (two 56-Core Intel Xeon MAX 9480 CPUs, 128GB RAM, ~100GB storage), but there is no requirements.txt, Dockerfile, conda environment, or detailed listing of library versions needed to reproduce the benchmark execution environment."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "While code and data are released at the project website, the paper itself does not contain step-by-step reproduction instructions, a README with commands, or a 'Reproducing Results' section. It is unclear how to set up and run the full benchmark evaluation pipeline from the paper alone."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Table 5 reports only point estimates for all metrics (Crash %, VIscore, VisFail %, NoE %, MiE %, MaE %) across all eight models. No confidence intervals, error bars, or uncertainty measures are provided for any result."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper makes comparative claims (e.g., 'Gemini 2.5 Pro dominated in its ability to produce code that executes') but no statistical significance tests are used to support these comparisons. Rankings are based solely on comparing raw numbers."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No formal effect sizes (Cohen's d, odds ratios, etc.) are reported. The paper presents raw percentages but does not contextualize differences with effect size measures. Some raw differences are given (e.g., crash rates range from 30.8% to 64.1%), but these lack formal effect size framing."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The benchmark contains 864 tasks (432 processing + 432 visualization) from 110 notebooks, but there is no justification for why this size is sufficient, no power analysis, and no discussion of whether the sample enables reliable model comparisons."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "All results appear to be from single runs of each model. No variance, standard deviation, or multi-run spread is reported. Each model was evaluated once with temperature=1, but no repeated evaluations were conducted to measure result stability."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Eight state-of-the-art LLMs are compared against each other as baselines (Gemini 2.5 Pro, Claude 3.7 Sonnet, Claude 4.0 Opus, o3-mini, GPT-4o, QwQ, Qwen-2.5, Llama-4 Maverick), providing mutual comparison."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The evaluated models are all contemporary and represent the state of the art at the time of writing: Gemini 2.5 Pro, Claude 4.0 Opus, Llama-4 Maverick, o3-mini, etc. These are recent models from major AI labs."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The benchmark has multiple design choices (query generation, underspecification clarification, VLM judge selection, processing/visualization split) but no ablation study measures the impact of these individual components on benchmark results."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Multiple metrics are used: Crash % (code execution success), VIscore (variable inspection), VisFail % (visualization failure), and a three-category error classification (NoE, MiE, MaE) for visualization evaluation. Additionally, Fleiss' kappa and Spearman's rho are used for inter-annotator agreement."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Five professional astronomers (post-PhD researchers and faculty) evaluated 135x2 LLM-generated visualizations, providing gold-standard judgments. The expert annotations were used to validate the automatic VLM-as-judge evaluator (Section 3.2.1)."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "The entire benchmark of 864 tasks is used for evaluation, with no separation into dev/test splits. The 30-task agreement subset is used to validate the VLM judge, not to tune model performance, and no held-out test set is defined for the LLM evaluations themselves."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down into processing tasks and visualization tasks (Table 5), with separate metrics for each. Error type distributions are provided (Figure 4), and visualization errors are categorized into no error, minor error, and major error."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 6 provides a detailed error analysis, discussing execution errors (Figure 4 with top 10 error types), visualization errors (domain-specific convention failures, suboptimal scaling, readability issues), and Appendix E shows examples of no error, minor error, and major error judgments with expert rationales."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper reports that all models perform poorly on the benchmark, with even the best model (Gemini 2.5 Pro) failing at least 58% of the time on visualizations. They also note that Gemini 2.5 Pro's high execution rate does not translate to high scientific correctness (VIscore was not among the highest)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims 'a significant gap in their ability to engage in astronomy research as useful assistants,' which is supported by Table 5 showing high crash rates (30-64%), low VIscores (0.47-0.69), and high major error rates (14-31%). The abstract also claims the VLM-as-judge is 'validated against annotation by five professional astronomers,' which is supported by Table 2 showing high correlations."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper largely avoids strong causal claims. It describes what models fail at (e.g., 'many of these models lack the knowledge necessary to use niche, domain-specific libraries') based on empirical observation from error analysis. The causal language used is modest and supported by the specific error patterns observed in Section 6."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The paper explicitly frames results within the astronomy domain and does not overclaim generalization to all scientific fields. The title and abstract specify 'astronomy domain.' The conclusion says this 'paves the way for the future development of models that can aid researchers across a wide range of domains' but frames this as future work rather than a demonstrated claim."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper does not substantively discuss alternative explanations for why models fail. For example, failures could be due to prompt design, context length limitations, or the specific choice of notebooks rather than fundamental model limitations. The Limitations section discusses noise from LLM-based construction and VLM judge limitations but does not consider alternative explanations for the observed performance gaps."
    137       }
    138     },
    139     "setup_transparency": {
    140       "model_versions_specified": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper lists model names (GPT-4o, Claude 3.7 Sonnet, Claude 4.0 Opus, Gemini 2.5 Pro, o3-mini, Qwen-2.5 72B, QwQ 32B, Llama-4 Maverick 17Bx128E) but does not provide specific API version identifiers or snapshot dates for the proprietary models. No version strings like 'gpt-4o-2024-xx-xx' are given."
    144       },
    145       "prompts_provided": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The paper provides the full prompts used for benchmark construction (Appendix B.2), underspecification clarification (B.3), code generation (B.4), and VLM evaluation (B.1). These are the actual prompt texts with template placeholders whose fill values are defined."
    149       },
    150       "hyperparameters_reported": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Appendix D reports hyperparameter settings: temperature=1 for all models, top_p=1 for most (0.95 for Gemini), max_tokens=1024 for Claude 3.7 Sonnet (with 10 responses hitting the limit noted), and max_tokens=8000 with 3000 thinking tokens for Claude 4.0 Opus. Open-source models used Together.AI API defaults."
    154       },
    155       "scaffolding_described": {
    156         "applies": false,
    157         "answer": false,
    158         "justification": "No agentic scaffolding is used. The evaluation is a straightforward single-turn code generation task: given a prompt, each model generates code, which is then executed and evaluated."
    159       },
    160       "data_preprocessing_documented": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section 2 describes the full pipeline from notebook collection (Astro Data Lab and STScI sources) through dependency extraction, stage splitting (setup/processing/visualization), query generation with GPT-4o, and underspecification clarification. The construction methodology is detailed with filtering criteria."
    164       }
    165     },
    166     "limitations_and_scope": {
    167       "limitations_section_present": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "A 'Limitations' subsection appears at the end of Section 8 (Conclusion), discussing three specific limitations: LLM-aided benchmark construction may introduce noise, VLM-based evaluation may not align with expert judgments, and variable inspection has limitations with certain runtime objects."
    171       },
    172       "threats_to_validity_specific": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "The limitations are specific to this study: (1) GPT-4o-generated queries may introduce hallucinations (though experts verified a subset), (2) VLM judge may diverge from expert assessments (mitigated by correlation validation), (3) pickling limitations prevent evaluation of certain runtime objects. These are not generic disclaimers."
    176       },
    177       "scope_boundaries_stated": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "While the paper focuses on astronomy, it does not explicitly state what the results do NOT show. For example, it does not state that results may not generalize to other scientific domains, that the notebook sources (Astro Data Lab and STScI tutorials) may not represent all astronomy research workflows, or that single-turn evaluation does not reflect interactive coding assistance."
    181       }
    182     },
    183     "data_integrity": {
    184       "raw_data_available": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The paper states that code and data are released at astrovisbench.github.io, which would include the benchmark tasks, source notebooks, and evaluation data, enabling independent verification."
    188       },
    189       "data_collection_described": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Section 2.2 describes notebook collection from Astro Data Lab and STScI, the types of notebooks (tutorials and scientific use cases), the target audience (astronomy graduate students and professional researchers), and the range of tasks covered."
    193       },
    194       "recruitment_methods_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "For the expert evaluation, the paper describes the annotators as 'five professional astronomers, all of whom have a doctoral degree in astronomy, astrophysics, or physics, and are working as researchers or faculty members' (Section 3.2.1). They engaged in 6 hours of group discussions plus asynchronous annotation."
    198       },
    199       "data_pipeline_documented": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Section 2 documents the pipeline: notebook collection from two sources, dependency extraction from visualization cells, splitting into three stages (setup/processing/visualization), query generation with GPT-4o, underspecification clarification with GPT-4o, and code matching verification. The final benchmark sizes are stated (864 tasks from 110 notebooks)."
    203       }
    204     },
    205     "conflicts_of_interest": {
    206       "funding_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The Acknowledgments section discloses funding from the National Science Foundation under Cooperative Agreement 2421782, the Simons Foundation grant MPS-AI-00010515 (NSF-Simons AI Institute for Cosmic Origins - CosmicAI), and computational resources from TACC."
    210       },
    211       "affiliations_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Author affiliations are clearly listed: University of Texas at Austin, NSF NOIRLab, University of Virginia, SLAC National Accelerator Laboratory, and Texas Advanced Computing Center. None of the authors appear to be affiliated with the AI companies whose models are evaluated."
    215       },
    216       "funder_independent_of_outcome": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "The funders (NSF, Simons Foundation) are independent scientific funding agencies with no financial stake in the performance of any particular LLM evaluated in the benchmark."
    220       },
    221       "financial_interests_declared": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No competing interests or financial interests statement is provided in the paper. While this appears to be a purely academic study, the absence of a formal declaration means this criterion is not met."
    225       }
    226     },
    227     "contamination": {
    228       "training_cutoff_stated": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "The paper evaluates eight LLMs on benchmark tasks but does not state the training data cutoff dates for any of the models. This is relevant because the source notebooks from Astro Data Lab and STScI are publicly available and could be in training data."
    232       },
    233       "train_test_overlap_discussed": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "The source notebooks come from publicly available online collections (github.com/astro-datalab/notebooks-latest, spacetelescope.github.io/notebook-infrastructure/), which are likely in the training data of the evaluated models. This potential overlap is not discussed anywhere in the paper."
    237       },
    238       "benchmark_contamination_addressed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "The benchmark is constructed from publicly available Jupyter notebooks that have been online for years. Since most evaluated models were trained on web data including GitHub repositories, contamination is a serious concern. The paper does not address this at all, despite it being a critical threat to validity for a benchmark paper."
    242       }
    243     },
    244     "human_studies": {
    245       "pre_registered": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "The expert annotation task is a benchmark validation exercise with professional astronomers, not a human subjects study requiring pre-registration."
    249       },
    250       "irb_or_ethics_approval": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "The expert annotation involves professional researchers providing domain judgments, not a human subjects experiment requiring IRB approval."
    254       },
    255       "demographics_reported": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "The expert annotators are professional astronomers serving as domain validators, not study participants. Their qualifications (doctoral degrees, researcher/faculty status) are reported as relevant background."
    259       },
    260       "inclusion_exclusion_criteria": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "Expert annotators were domain specialists recruited for benchmark validation, not participants in a human subjects study."
    264       },
    265       "randomization_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human subjects experiment with randomization to conditions was conducted."
    269       },
    270       "blinding_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human subjects experiment requiring blinding was conducted."
    274       },
    275       "attrition_reported": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human subjects study with potential attrition was conducted."
    279       }
    280     },
    281     "cost_and_practicality": {
    282       "inference_cost_reported": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "The paper does not report any API costs, tokens consumed per model, or wall-clock time for running the benchmark evaluations across eight models. This is relevant since the evaluation involves 864 tasks per model plus VLM-based evaluation."
    286       },
    287       "compute_budget_stated": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Appendix F states the hardware used: 'two 56-Core Intel Xeon MAX 9480 CPUs with 128GB of RAM' and notes '~100GB of storage' per model evaluation, with '~50GB for the execution environment' and '~50GB for pickled objects.' However, total API spend and GPU hours are not mentioned."
    291       }
    292     }
    293   },
    294   "claims": [
    295     {
    296       "claim": "ASTROVISBENCH is the first benchmark for both scientific computing and visualization in the astronomy domain.",
    297       "evidence": "The paper constructs 864 tasks (432 processing + 432 visualization) from 110 Jupyter notebooks sourced from Astro Data Lab and STScI collections (Section 2). Table 6 in Appendix C compares with related benchmarks and shows no prior astronomy-specific workflow benchmark exists.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "The VLM-as-judge (Claude 3.5 Sonnet) achieves high correlation with expert astronomer judgments for visualization evaluation.",
    302       "evidence": "Table 2 shows Claude 3.5 Sonnet achieves Spearman correlation of 0.822 (avg) and 0.828 (majority) with expert judgments, with p < 1e-29. This was validated on 30 tasks annotated by all 5 experts (Section 3.2.2).",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "State-of-the-art LLMs show a significant gap in their ability to engage in astronomy research workflows as useful assistants.",
    307       "evidence": "Table 5 shows crash rates of 30.8-64.1% for processing tasks, VIscores of 0.47-0.69, and combined visualization failure (crash + VisFail + MaE) rates exceeding 58% for all models. Even the best model (Gemini 2.5 Pro) produces correct visualizations only 15.7% of the time (Section 5).",
    308       "supported": "strong"
    309     },
    310     {
    311       "claim": "Better code execution success does not necessarily mean better scientific computing accuracy.",
    312       "evidence": "Gemini 2.5 Pro had the lowest crash rate (30.8%) but its VIscore (0.600) was not the highest — o3-mini had a higher VIscore (0.694) despite a higher crash rate (51.4%). This is discussed in Section 5.1.",
    313       "supported": "moderate"
    314     },
    315     {
    316       "claim": "LLMs lack knowledge necessary to use niche, domain-specific libraries and APIs in astronomy.",
    317       "evidence": "Section 6 identifies FileNotFoundError, queryClientError (ADQL queries), and AttributeError as top errors. The paper attributes these to LLMs hallucinating file paths, function calls, and arguments for domain-specific tools. Table 4 lists 38 libraries with 26 astronomy-specific ones.",
    318       "supported": "moderate"
    319     }
    320   ],
    321   "methodology_tags": [
    322     "benchmark-eval"
    323   ],
    324   "key_findings": "ASTROVISBENCH introduces 864 benchmark tasks for evaluating LLMs on astronomy-specific scientific computing and visualization workflows. Evaluation of eight state-of-the-art LLMs reveals substantial performance gaps: even the best model (Gemini 2.5 Pro) crashes on 30.8% of processing tasks and produces correct visualizations only 15.7% of the time. The paper develops a VLM-as-judge approach validated against five professional astronomers (Spearman rho = 0.828 with majority labels). Key failure modes include inability to use domain-specific APIs, hallucinating file paths and function calls, and violations of astronomy-specific plotting conventions.",
    325   "red_flags": [
    326     {
    327       "flag": "Benchmark contamination not addressed",
    328       "detail": "The source notebooks come from publicly available collections (astro-datalab/notebooks-latest on GitHub, STScI collections) that have been online for years. Since most evaluated models were likely trained on web data including GitHub, the benchmark tasks may already be in their training sets. This fundamental contamination concern is never discussed, and it could inflate or distort model performance patterns."
    329     },
    330     {
    331       "flag": "No variance or repeated runs",
    332       "detail": "All eight models were evaluated with temperature=1 (stochastic sampling) but results appear to be from single runs only. With 864 tasks and stochastic generation, different runs could yield meaningfully different results. No standard deviations, confidence intervals, or repeated measurements are reported."
    333     },
    334     {
    335       "flag": "Small expert validation sample",
    336       "detail": "The VLM-as-judge is validated against only 30 tasks annotated by all 5 experts. While the correlation is high (0.828), this is a small sample to validate an automatic evaluator that is then used to assess all 432 visualization tasks across 8 models. Moderate inter-annotator agreement (Fleiss' kappa = 0.53) on this small sample adds further uncertainty."
    337     },
    338     {
    339       "flag": "GPT-4o used in benchmark construction",
    340       "detail": "GPT-4o was used to generate queries and handle underspecification clarification during benchmark construction, but GPT-4o is also one of the evaluated models. This creates a potential advantage or disadvantage for GPT-4o depending on whether the generated queries are biased toward or against its capabilities."
    341     }
    342   ],
    343   "cited_papers": [
    344     {
    345       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    346       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    347       "year": 2024,
    348       "arxiv_id": "2310.06770",
    349       "relevance": "Major benchmark for evaluating LLM coding ability on real-world software engineering tasks, directly comparable to ASTROVISBENCH's domain-specific code evaluation."
    350     },
    351     {
    352       "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions",
    353       "authors": ["Terry Yue Zhuo"],
    354       "year": 2025,
    355       "relevance": "Benchmark for LLM code generation with diverse function calls, relevant to evaluating how models handle complex API usage."
    356     },
    357     {
    358       "title": "Evaluating large language models trained on code",
    359       "authors": ["Mark Chen", "Jerry Tworek"],
    360       "year": 2021,
    361       "arxiv_id": "2107.03374",
    362       "relevance": "Foundational HumanEval benchmark for LLM code generation evaluation, establishing the evaluation paradigm used in this work."
    363     },
    364     {
    365       "title": "The AI scientist: Towards fully automated open-ended scientific discovery",
    366       "authors": ["Chris Lu", "Cong Lu", "Robert Tjarko Lange", "Jakob Foerster", "Jeff Clune", "David Ha"],
    367       "year": 2024,
    368       "relevance": "Explores fully automated AI-driven scientific research, directly relevant to the goal of LLMs as scientific research assistants."
    369     },
    370     {
    371       "title": "PaperBench: Evaluating AI's Ability to Replicate AI Research",
    372       "authors": ["Giulio Starace", "Oliver Jaffe", "Dane Sherburn"],
    373       "year": 2025,
    374       "relevance": "Benchmark evaluating AI's ability to replicate ML research workflows, closely related to ASTROVISBENCH's focus on scientific research workflows."
    375     },
    376     {
    377       "title": "SciCode: A research coding benchmark curated by scientists",
    378       "authors": ["Minyang Tian", "Luyu Gao"],
    379       "year": 2024,
    380       "relevance": "Benchmark for scientific coding tasks curated by domain experts, directly comparable to ASTROVISBENCH's approach of expert-curated scientific benchmarks."
    381     },
    382     {
    383       "title": "MLE-bench: Evaluating machine learning agents on machine learning engineering",
    384       "authors": ["Jun Shern Chan", "Neil Chowdhury"],
    385       "year": 2025,
    386       "relevance": "Benchmark for evaluating ML engineering capabilities of LLM agents, relevant as a scientific coding benchmark comparison."
    387     },
    388     {
    389       "title": "MatPlotAgent: Method and evaluation for LLM-based agentic scientific data visualization",
    390       "authors": ["Zhiyu Yang", "Zihan Zhou"],
    391       "year": 2024,
    392       "relevance": "Agentic framework for LLM-based visualization code generation, directly relevant as a visualization-focused LLM evaluation approach."
    393     },
    394     {
    395       "title": "VisEval: A Benchmark for Data Visualization in the Era of Large Language Models",
    396       "authors": ["Nan Chen", "Yuge Zhang", "Jiahang Xu", "Kan Ren", "Yuqing Yang"],
    397       "year": 2025,
    398       "relevance": "Benchmark for evaluating LLM-generated visualizations using execution-based and VLM evaluation, directly comparable evaluation methodology."
    399     },
    400     {
    401       "title": "MLAgentBench: Evaluating Language Agents on Machine Learning Experimentation",
    402       "authors": ["Qian Huang", "Jian Vora", "Percy Liang", "Jure Leskovec"],
    403       "year": 2023,
    404       "relevance": "Benchmark for LLM agents performing ML experimentation tasks, relevant to the broader theme of AI-assisted scientific research."
    405     },
    406     {
    407       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    408       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    409       "year": 2023,
    410       "relevance": "Rigorous evaluation methodology for LLM code generation, relevant to understanding evaluation challenges in code benchmarks."
    411     }
    412   ]
    413 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs