scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27662B)
      1 {
      2   "paper": {
      3     "title": "Plot2Code: A Comprehensive Benchmark for Evaluating Multi-modal Large Language Models in Code Generation from Scientific Plots",
      4     "authors": [
      5       "Chengyue Wu",
      6       "Zhixuan Liang",
      7       "Yixiao Ge",
      8       "Qiushan Guo",
      9       "Zeyu Lu",
     10       "Jiahao Wang",
     11       "Ying Shan",
     12       "Ping Luo"
     13     ],
     14     "year": 2025,
     15     "venue": "Findings of the Association for Computational Linguistics: NAACL 2025",
     16     "doi": "10.18653/v1/2025.findings-naacl.164"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "Plot2Code introduces a 368-plot benchmark (matplotlib + plotly) for evaluating MLLMs' ability to generate code from scientific plots. GPT-4V achieves the highest rating of 7.68/10, with open-source models significantly lagging behind closed-source ones. Traditional low-level metrics (MSE, SSIM) fail to distinguish image quality, while the proposed GPT-4V rating and text-match ratio show significant discriminative power validated against human evaluation (Pearson r=0.479). Prompt strategies like CoT and Plan-and-Solve do not clearly improve performance on this visual coding task.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "GitHub repository is provided: https://github.com/TencentARC/Plot2Code (linked in the abstract/header area)."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Dataset released on HuggingFace under Apache-2.0 license: https://huggingface.co/datasets/TencentARC/Plot2Code (Appendix E.6)."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is mentioned in the paper. Only general library names (matplotlib, plotly) are referenced."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions are provided in the paper. While prompts are given in the appendix, there are no instructions for running the full evaluation pipeline."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Main results in Table 3 report point estimates only (e.g., pass rates, ratings) with no confidence intervals or error bars."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 5 performs hypothesis tests (two-sample t-tests) to validate evaluation metrics, reporting t-statistics and p-values in Table 4. Table 5 additionally reports a correlation analysis using Kendall's Tau, Pearson, and Spearman coefficients."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Raw scores and differences between models are reported, but no formal effect sizes (Cohen's d, odds ratios, etc.) are provided."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Section 5 ('Robustness Analysis upon Sample Size') shows that scores stabilize above 60 samples (Figure 8), empirically justifying the 132-sample matplotlib benchmark size."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Main evaluation results in Table 3 are single-run point estimates with no variance, standard deviation, or spread measures across experimental runs."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "14 models are compared, including both closed-source (GPT-4V, Claude-3-Opus, Gemini-Pro) and open-source (DeepSeek-VL, LLaVA, Mini-Gemini) models (Table 3)."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Models evaluated include contemporary systems: GPT-4V, Claude-3-Opus, Gemini-Pro, DeepSeek-VL, LLaVA-1.6, all from 2023-2024."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Section 4.3 ablates prompt strategies (CoT, PS+), resolution settings (low vs. high), and OCR token inclusion (Figure 5). Section 4.4 provides pairwise comparisons."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Three evaluation metrics are used: code pass rate, text-match ratio, and GPT-4V rating judgement (Section 3.4)."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Human evaluation is conducted: 100 people rate GPT-4V generated samples (absolute rating), and 46 people complete pairwise comparison questionnaires (Appendix C)."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The benchmark is a dedicated test set. Pre-trained models are evaluated zero-shot without any fine-tuning on the benchmark data."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Figure 6 provides per-plot-type breakdowns across 6 categories for three models. Table 1 shows statistics by type."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Appendix B shows case studies including a direct asking failure (Figure 10, rated 3/10). The paper discusses challenges with text-dense plots and cases where models struggle."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "CoT and PS+ prompt strategies show no clear advantage (Section 4.3, Figure 4). Higher resolution does not always help (Mini-Gemini-34B-HD underperforms Mini-Gemini-8x7B-HD)."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Abstract claims about GPT-4V achieving 7.68/10, text-dense plot challenges, GPT-4V rating correlating with human evaluation, and cross-validation consistency are all supported by results in Sections 4-5."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Causal claims (e.g., 'inclusion of image input is beneficial,' 'adding OCR tokens improved performance') are supported by controlled ablation comparisons where a single variable is changed (Figure 4, Figure 5)."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The title claims 'Comprehensive Benchmark' for code generation from 'Scientific Plots,' but the benchmark only covers matplotlib and plotly libraries. The Limitations section acknowledges it 'may not fully capture the complexity and variety of other types of programming tasks' but the title and abstract overclaim comprehensiveness."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper does not discuss alternative explanations for the observed performance differences. For example, performance gaps between open-source and closed-source models could be due to training data differences, model size, or instruction tuning quality, but these are not explored."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Section 5 explicitly validates the GPT-4V rating proxy against human evaluation, showing moderate correlation (Pearson 0.479). They also demonstrate that traditional metrics (MSE, SSIM) fail as proxies (Table 4), distinguishing between proxy and actual visual similarity."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "Models are referred to by marketing names only: 'GPT-4V,' 'Claude-3-Opus,' 'Gemini-Pro,' 'DeepSeek-VL-7B.' No API snapshot dates or specific version identifiers are provided."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Full prompt texts for code generation (direct and conditional asking), instruction generation, evaluation rating, and pair evaluation are provided in Appendix A.1-A.4."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "No mention of temperature, top-p, max tokens, or other API sampling settings for any of the 14 evaluated models."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. Models receive direct prompts and produce single-turn code outputs."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 3.1 documents the three-stage filtering pipeline: generation filtering (extracting single code blocks from HTML), type filtering (excluding animations/widgets), and manual curation (3 criteria specified)."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "A dedicated 'Limitations' paragraph follows Section 6 (Conclusion), discussing potential contamination and scope constraints."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The Limitations section identifies two specific threats: (1) contamination from pre-training data potentially including benchmark plots, and (2) the dataset not capturing other programming task types beyond chart generation."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The Limitations section states the benchmark 'may not fully capture the complexity and variety of other types of programming tasks beyond chart generation,' explicitly bounding the scope to plotting tasks."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Dataset with plots, code, and instructions is publicly available on HuggingFace (https://huggingface.co/datasets/TencentARC/Plot2Code) under Apache-2.0 license."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 3.1 describes the collection: crawling matplotlib/plotly gallery website links, extracting code blocks, then three-stage filtering."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "For the 46-person pairwise evaluation, participants are described as 'colleagues from the lab, who hold at least a bachelor's degree.' For the 100-person absolute rating study, no recruitment details are provided at all."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Section 3.1 documents the full pipeline: web crawling → generation filtering (single code blocks) → type filtering (excluding animations, widgets, event handling) → manual curation (3 criteria), resulting in 368 examples."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding sources or acknowledgments section is present in the paper."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are clearly listed: University of Hong Kong, ARC Lab at Tencent PCG, and Shanghai Jiao Tong University."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "Authors from Tencent (which develops Mini-Gemini) evaluate their own models alongside competitors. No funding source is disclosed, making independence unassessable."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interest declaration is present in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No training data cutoff dates are stated for any of the 14 models evaluated, despite the benchmark data coming from publicly available matplotlib/plotly galleries."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "The Limitations section mentions 'potential contamination, as pre-trained models might use the data as part of its training data' but provides no analysis of actual overlap."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "The matplotlib/plotly gallery examples are publicly available online and were almost certainly included in training data for models like GPT-4V. The paper acknowledges this risk but takes no steps to address it (no canary strings, no temporal analysis, no decontamination)."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "No pre-registration is mentioned for either the 100-person absolute rating or the 46-person pairwise evaluation studies."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": true,
    259         "answer": false,
    260         "justification": "No IRB or ethics board approval is mentioned for the human evaluation studies."
    261       },
    262       "demographics_reported": {
    263         "applies": true,
    264         "answer": false,
    265         "justification": "For the 46-person study, only 'colleagues from the lab, who hold at least a bachelor's degree' is stated. For the 100-person study, no demographics are reported."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": true,
    269         "answer": false,
    270         "justification": "Only a minimal criterion is stated for the 46-person study (lab colleagues who hold 'at least a bachelor's degree'); no criteria are stated for the 100-person study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "This is a rating/evaluation task, not an experimental study comparing conditions among human participants."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "Not an experimental study on human participants. The pairwise evaluation does swap image positions to mitigate order bias (Section 4.4), but this is about evaluator bias, not participant blinding in an experimental sense."
    281       },
    282       "attrition_reported": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No information on how many people were invited versus how many completed the evaluation tasks."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No API costs, tokens consumed, or wall-clock time reported for evaluating 14 models across 368 plots."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No computational budget or hardware specifications are mentioned for running the evaluation pipeline."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Results in Table 3 appear to be single-run evaluations. No multi-seed analysis is performed for model outputs."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of evaluation runs per model is not stated. Results appear to be from a single evaluation pass."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No hyperparameter search is described. Default settings appear to be used but are not reported."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "All configurations are reported (default, CoT, PS+) in Table 3 and Figure 4, not just the best one. No cherry-picking of the best configuration."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "Multiple comparisons across 14 models and multiple metrics are reported without any correction for multiple testing (e.g., Bonferroni)."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "Authors from Tencent evaluate Mini-Gemini (their own model) alongside competitors without acknowledging potential author-evaluation bias."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "Models of vastly different sizes (from 2B parameters up to GPT-4V, whose size is undisclosed) are compared without discussing compute requirements or matching compute budgets."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": true,
    339         "justification": "Section 5 validates the proposed metrics through hypothesis testing (Table 4), demonstrating that GPT-4V rating and text-match ratio significantly distinguish image quality groups, while traditional metrics (MSE, SSIM) do not."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No scaffolding is involved. Models are prompted directly with single-turn inference."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "The benchmark uses matplotlib/plotly gallery examples that have been publicly available for years. Models trained after these galleries were published may have seen the exact code solutions. This temporal leakage is not addressed."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "In the conditional asking setting, GPT-4 generated instructions may encode implementation-specific details from the ground truth code. This potential feature leakage is not discussed."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "All plots come from official gallery examples. Models trained on web data likely encountered these exact examples. The non-independence between training data and test data is not analyzed."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No concrete leakage detection method is used (no canary strings, no membership inference, no n-gram overlap analysis)."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Plot2Code poses considerable challenges, with GPT-4V achieving an overall score of 7.68/10 in conditional asking",
    373       "evidence": "Table 3 shows GPT-4V (PS+) achieves 7.83 maximum rating; baseline GPT-4V achieves 7.68. Even the best models score well below 10.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Open-source models significantly lag behind closed-source models",
    378       "evidence": "Table 3: best open-source (Mini-Gemini-8x7B-HD) scores 6.08 GPT-4V rating and 58.4% pass rate, vs closed-source GPT-4V at 7.68 rating and 84.1% pass rate.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "GPT-4V rating judgement correlates well with human evaluations",
    383       "evidence": "Table 5: Pearson r=0.479 (p=6.89×10⁻⁵⁴), Kendall's Tau=0.437 (p=8.68×10⁻⁴⁹), Spearman r=0.469 (p=1.57×10⁻⁵¹). Correlation stabilizes above 60 samples (Figure 8).",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "GPT-4V rating and text-match ratio significantly distinguish image quality groups, unlike MSE and SSIM",
    388       "evidence": "Table 4: GPT-4V rating p=1.22×10⁻¹¹, text-match ratio p=9.62×10⁻⁷, rejecting H0. MSE (p=0.22) and SSIM (p=0.21) fail to reject H0.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Chain-of-Thought and Plan-and-Solve prompt strategies do not yield clear advantages",
    393       "evidence": "Figure 4 and Table 3: GPT-4V (CoT) achieves 7.75 vs GPT-4V baseline 7.68 rating; pairwise evaluation shows CoT wins 40%, ties 49%, loses 11% vs baseline.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Cross-validation among MLLMs shows high consistency in ratings with minimal bias",
    398       "evidence": "Figure 9: Cronbach's Alpha values of 0.77, 0.84, and 0.82 for GPT-4V, Claude, and Gemini generated samples respectively.",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "Higher resolution vision encoders improve MLLM performance on Plot2Code",
    403       "evidence": "Table 3 comparison of low-res vs HD variants: Mini-Gemini-8x7B goes from 3.76 to 3.87 (direct asking rating), and 5.74 to 6.08 (conditional asking rating).",
    404       "supported": "weak"
    405     }
    406   ],
    407   "red_flags": [
    408     {
    409       "flag": "Severe contamination risk",
    410       "detail": "Benchmark plots are scraped from matplotlib/plotly official galleries — publicly available web content almost certainly included in training data of all evaluated models. The paper acknowledges this but takes no steps to measure or mitigate it, undermining the validity of all benchmark scores."
    411     },
    412     {
    413       "flag": "Self-evaluation conflict",
    414       "detail": "Authors from Tencent ARC Lab evaluate Mini-Gemini (their own model family) alongside competitors. While results do not favor Mini-Gemini, the conflict is not disclosed or discussed."
    415     },
    416     {
    417       "flag": "No error bars on main results",
    418       "detail": "All model evaluations in Table 3 appear to be single-run results with no uncertainty quantification. LLM outputs are stochastic, so single-run results may not be representative."
    419     },
    420     {
    421       "flag": "Moderate human-AI correlation claimed as strong",
    422       "detail": "The paper describes the Pearson correlation of 0.479 between GPT-4V and human ratings as showing that the metric 'correlates well' and is 'in general agreement' with human evaluation, but 0.479 is only a moderate correlation, explaining just ~23% of the variance."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "Evaluating large language models trained on code",
    428       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    429       "year": 2021,
    430       "arxiv_id": "2107.03374",
    431       "relevance": "Introduces HumanEval, a foundational LLM code generation benchmark that Plot2Code extends to multi-modal settings."
    432     },
    433     {
    434       "title": "Program synthesis with large language models",
    435       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    436       "year": 2021,
    437       "arxiv_id": "2108.07732",
    438       "relevance": "Introduces MBPP benchmark for code generation evaluation, a key baseline for comparing code benchmarks."
    439     },
    440     {
    441       "title": "Design2code: How far are we from automating front-end engineering?",
    442       "authors": ["Chenglei Si", "Yanzhe Zhang", "Zhengyuan Yang", "Ruibo Liu", "Diyi Yang"],
    443       "year": 2024,
    444       "arxiv_id": "2403.03163",
    445       "relevance": "Evaluates MLLMs on UI-to-code generation, directly related to visual code generation benchmarking."
    446     },
    447     {
    448       "title": "MMCode: Evaluating multi-modal code large language models with visually rich programming problems",
    449       "authors": ["Kaixin Li", "Yuchen Tian", "Qisheng Hu", "Ziyang Luo", "Jing Ma"],
    450       "year": 2024,
    451       "arxiv_id": "2404.09486",
    452       "relevance": "Multi-modal code benchmark integrating images into code tasks, a direct comparator to Plot2Code."
    453     },
    454     {
    455       "title": "Code Llama: Open foundation models for code",
    456       "authors": ["Baptiste Roziere", "Jonas Gehring", "Fabian Gloeckle"],
    457       "year": 2023,
    458       "arxiv_id": "2308.12950",
    459       "relevance": "Major open-source code LLM that represents the state of code generation capabilities."
    460     },
    461     {
    462       "title": "DeepSeek-VL: Towards real-world vision-language understanding",
    463       "authors": ["Haoyu Lu", "Wen Liu", "Bo Zhang"],
    464       "year": 2024,
    465       "arxiv_id": "2403.05525",
    466       "relevance": "Open-source multi-modal LLM evaluated in the benchmark, relevant to understanding MLLM capabilities."
    467     },
    468     {
    469       "title": "ChartQA: A benchmark for question answering about charts with visual and logical reasoning",
    470       "authors": ["Ahmed Masry", "Do Xuan Long", "Jia Qing Tan", "Shafiq Joty", "Enamul Hoque"],
    471       "year": 2022,
    472       "arxiv_id": "2203.10244",
    473       "relevance": "Chart understanding benchmark that complements Plot2Code's chart generation evaluation."
    474     },
    475     {
    476       "title": "DeepSeek-Coder: When the large language model meets programming",
    477       "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"],
    478       "year": 2024,
    479       "arxiv_id": "2401.14196",
    480       "relevance": "Code-specialized LLM relevant to understanding code generation capabilities in the agentic AI space."
    481     },
    482     {
    483       "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena",
    484       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    485       "year": 2024,
    486       "relevance": "Establishes LLM-as-judge evaluation methodology that Plot2Code adapts for visual similarity rating."
    487     },
    488     {
    489       "title": "StarCoder: May the source be with you!",
    490       "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"],
    491       "year": 2023,
    492       "arxiv_id": "2305.06161",
    493       "relevance": "Major open-source code LLM relevant to understanding code generation model development."
    494     },
    495     {
    496       "title": "ChartInstruct: Instruction tuning for chart comprehension and reasoning",
    497       "authors": ["Ahmed Masry", "Mehrad Shahmohammadi", "Md Rizwan Parvez"],
    498       "year": 2024,
    499       "arxiv_id": "2403.09028",
    500       "relevance": "Chart understanding instruction tuning benchmark, directly related to evaluating MLLM chart capabilities."
    501     }
    502   ],
    503   "engagement_factors": {
    504     "practical_relevance": {
    505       "score": 1,
    506       "justification": "A benchmark dataset useful for researchers evaluating MLLMs, but not directly usable as a practitioner tool."
    507     },
    508     "surprise_contrarian": {
    509       "score": 0,
    510       "justification": "Results confirm expected patterns: GPT-4V leads, open-source lags, prompt strategies don't help much."
    511     },
    512     "fear_safety": {
    513       "score": 0,
    514       "justification": "No safety or security concerns raised by the work."
    515     },
    516     "drama_conflict": {
    517       "score": 0,
    518       "justification": "No controversial findings or challenges to established claims."
    519     },
    520     "demo_ability": {
    521       "score": 2,
    522       "justification": "Code on GitHub and dataset on HuggingFace allow researchers to reproduce evaluations."
    523     },
    524     "brand_recognition": {
    525       "score": 2,
    526       "justification": "Evaluates well-known models (GPT-4V, Claude-3, Gemini) and comes from Tencent/HKU."
    527     }
    528   }
    529 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs