scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (21954B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "Drawing Pandas: A Benchmark for LLMs in Generating Plotting Code",
      6     "authors": [
      7       "Timur Galimzyanov",
      8       "Sergey Titov",
      9       "Yaroslav Golubev",
     10       "Egor Bogomolov"
     11     ],
     12     "year": 2024,
     13     "venue": "IEEE Working Conference on Mining Software Repositories",
     14     "arxiv_id": "2412.02764",
     15     "doi": "10.1109/MSR66628.2025.00083"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All abstract claims (175 tasks, library comparisons, task compression findings) are directly supported by experimental results in Tables I-III.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Main causal claims tested with controlled experiments (5 runs per condition, explicit library assignment). Causality established for task compression and library effects, though mechanisms (e.g., why Plotly fails) are only partially explained.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Scope explicitly bounded to 175 Matplotlib-derived tasks, Pandas DataFrames, Python only. Limitations section acknowledges 'currently limited to 175 data points originating from the Matplotlib gallery' and notes DataFrames are 'mostly concise...often far from real-world data.'",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Explains Plotly failures via 'incorrect usage of the Plotly library API' (manually reviewed 38 cases). Task compression robustness attributed to 'automatically generated DataFrame descriptions and thorough instructions.' However, alternative explanations for library differences (API complexity vs. training data availability) not deeply explored.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Measures execution (incorrect code %), visual similarity (0-100), task adherence (0-100), and validates against human judgment (r=0.85 task, r=0.66 visual). Acknowledges 'generated images do not have to match ground-truth plots exactly.' Distinguishes between metrics and validates correlation with human judgment.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Dedicated Section V 'LIMITATIONS AND FUTURE WORK' discusses specific threats and boundaries.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats: (1) 'models may still exhibit bias towards publicly available ground-truth Matplotlib code' (explicitly named); (2) potential OpenAI bias in task generation (acknowledged but argued negligible); (3) only 25% of tasks from real Matplotlib gallery vs. 75% manually created; (4) 'DataFrames...mostly concise...often far from real-world data'; (5) single-expert human validation only.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Explicitly stated: 175 tasks from Matplotlib gallery, Pandas DataFrames only, Python only, three libraries, synthetic data, single expert validation. Future work section lists planned expansions (other data sources, richer DataFrames, other languages, multi-expert validation).",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding source disclosed in acknowledgments or funding section. Authors from JetBrains Research and Delft University but no funding statement.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations clearly stated: JetBrains Research and Delft University of Technology. No apparent conflict (no JetBrains products are benchmarked).",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funder identified in paper.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement or declaration of financial interests, patents, equity, or consulting relationships.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms reasonably clear from context: 'benchmark' standard in field; 'plotting code generation' and 'visualization' self-evident; 'data leakage' explained as contamination via synthetic generation. 'Effectiveness as assistant' somewhat vague but used consistently to mean ability to generate correct plotting code.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Contribution explicitly stated: 'introduces the human-curated Pandas-PlotBench dataset' with 175 tasks 'to overcome existing gap' in prior benchmarks. Positioned as larger (vs. MatPlotBench's 25), more data-realistic, and more extensible than alternatives.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section IV directly compares to MatPlotBench (175 vs 25 data tasks), Plot2Code (132 but image-guided), ChartMimic (1000 but image-based), nvBench (larger but SQL/Vega-Lite), DS-1000 (too simple: 3-line solutions). Clear positioning of contribution. Could go deeper on methodological rationales (why data-based > image-based) but adequate engagement.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "benchmark-creation": {
    119       "construct_design": {
    120         "construct_validity_argued": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "Paper argues this benchmark measures 'LLMs' effectiveness as assistants in visual data exploration' by testing practical task: 'generating code for plotting data loaded in Pandas DataFrame format based on natural language instructions.' Construct is clear (practical plotting capability) though not deeply theorized.",
    124           "source": "haiku"
    125         },
    126         "difficulty_distribution_characterized": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "No explicit difficulty tiers or difficulty metrics reported. Table III shows task length variation (736→154 symbols) affecting performance, but no formal characterization of difficulty distribution or complexity levels across the 175 tasks.",
    130           "source": "haiku"
    131         },
    132         "ceiling_floor_effects_checked": {
    133           "applies": true,
    134           "answer": true,
    135           "justification": "No explicit discussion of ceiling/floor, but empirical results show reasonable spread: visual scores range 44-75%, task-based 36-89%, with Plotly at floor (~68%) and GPT-4o approaching ceiling at 89%. Adequate discriminability demonstrated.",
    136           "source": "haiku"
    137         },
    138         "human_baseline_included": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Paper includes single-expert human scoring for validation (first author, 5 years experience) and reports correlation with automated scoring (r=0.85), but does NOT report human performance scores on the benchmark itself. Correlation ≠ baseline performance.",
    142           "source": "haiku"
    143         },
    144         "scoring_rubric_justified": {
    145           "applies": true,
    146           "answer": true,
    147           "justification": "Two-metric rubric justified: visual scoring (comparing plots 0-100, 'focusing on main idea') and task-based scoring (adherence to task description). Task-based preferred over visual based on human correlation (r=0.85 vs 0.66). CodeBERT Score explicitly tested and rejected as 'inadequate.' Rubric is empirically validated.",
    148           "source": "haiku"
    149         }
    150       },
    151       "robustness": {
    152         "contamination_resistance_designed": {
    153           "applies": true,
    154           "answer": true,
    155           "justification": "Synthetic data generation ('tasks and data files are synthetically generated') and synthetic task creation via GPT-4V provides contamination resistance. However, data source (Matplotlib gallery) is public and likely in training data. Paper acknowledges 'models may still exhibit bias towards publicly available ground-truth Matplotlib code' despite modifications.",
    156           "source": "haiku"
    157         },
    158         "temporal_robustness_discussed": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "No discussion of benchmark degradation, gaming over time, or version management strategy. Only future work mentions 'expansion to other sources' without addressing temporal robustness or obsolescence risk.",
    162           "source": "haiku"
    163         },
    164         "failure_modes_discussed": {
    165           "applies": true,
    166           "answer": true,
    167           "justification": "Discusses failure modes only at the dataset level (limited to the Matplotlib gallery, concise DataFrames, Python only, single-expert validation); it does NOT discuss failure modes of the benchmark as a measurement instrument, i.e., which real scenarios it misses or how it could be gamed.",
    168           "source": "haiku"
    169         },
    170         "baseline_implementations_provided": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Paper states 'open-source our implementation of the baseline methods and the evaluation code' with GitHub link (JetBrains-Research/PandasPlotBench) and HuggingFace dataset link. Others can reproduce reported scores.",
    174           "source": "haiku"
    175         }
    176       },
    177       "documentation": {
    178         "dataset_documentation_complete": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Section II provides detailed documentation: data collection process (5 steps with filtering, splitting, validation), data characteristics (CSV files, loading scripts, ground truth code, three task versions). Supplementary materials referenced for prompts and examples. No formal data card mentioned but coverage is comprehensive.",
    182           "source": "haiku"
    183         },
    184         "licensing_and_access_clear": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Dataset and code explicitly published: 'Our dataset and benchmark code are available online' with HuggingFace (JetBrains-Research/PandasPlotBench) and GitHub links provided. Likely permissive license (JetBrains research, open platforms) though specific license not stated in paper.",
    188           "source": "haiku"
    189         },
    190         "intended_use_specified": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Intended use clearly stated: 'evaluate language models' effectiveness as assistants in visual data exploration' for 'generating code for plotting tabular data.' Scope is practical (real-world plotting tasks) though no explicit guidance on misuses or invalid conclusions.",
    194           "source": "haiku"
    195         }
    196       }
    197     }
    198   },
    199   "claims": [
    200     {
    201       "claim": "Task compression (reducing instruction length from 736 to 154 symbols) has minimal effect on plotting code generation performance",
    202       "evidence": "Table III shows task-based scoring decreases only from 89% to 85% despite a roughly 4.8× reduction in task text (736→154 symbols). Visual scores 75%→71%. Five runs per condition for robustness.",
    203       "supported": "strong"
    204     },
    205     {
    206       "claim": "LLMs perform significantly better with Matplotlib and Seaborn than with Plotly",
    207       "evidence": "Table II: Matplotlib 89% task-based, Seaborn 84%, Plotly 68%. Incorrect code: 1.8% (Matplotlib) vs 22% (Plotly). Manual review identified Plotly API misuse as primary failure mode.",
    208       "supported": "strong"
    209     },
    210     {
    211       "claim": "GPT-4o is the fastest among highest-performing models on this benchmark",
    212       "evidence": "Table I: GPT-4o achieves 89% task-based score in 4.8s/item; Claude 3.5 Sonnet achieves 88% in 5.8s/item; Claude 3 Opus achieves 87% in 14.0s/item.",
    213       "supported": "strong"
    214     },
    215     {
    216       "claim": "Synthetic task generation protects this benchmark from data leakage",
    217       "evidence": "Tasks and DataFrames 'synthetically generated' and 'never seen by any LLMs.' However, tasks derived from public Matplotlib gallery code which likely appears in training data.",
    218       "supported": "moderate"
    219     },
    220     {
    221       "claim": "Task-based scoring is more reliable than visual scoring for evaluating generated plots",
    222       "evidence": "Correlation with human expert judgment: task-based r=0.85 vs visual r=0.66. Suggests that adherence of the plot to the task instructions predicts quality better than judged visual similarity to the ground-truth plot.",
    223       "supported": "strong"
    224     },
    225     {
    226       "claim": "Large Llama models (70B, 405B) perform comparably to proprietary state-of-the-art LLMs",
    227       "evidence": "Table I: Llama 3.1 405B achieves 86% task-based (vs GPT-4o 89%), Llama 70B achieves 82% (vs Claude Sonnet 88%). Smaller Llama models (8B, 3B) fail significantly.",
    228       "supported": "strong"
    229     },
    230     {
    231       "claim": "DataFrame descriptions using head(5) method with column types are optimal",
    232       "evidence": "Paper states 'head(5) method...supplemented with column names and types...delivered the best performance' but no quantitative comparison with alternatives provided.",
    233       "supported": "moderate"
    234     }
    235   ],
    236   "methodology_tags": [
    237     "benchmark-eval",
    238     "observational"
    239   ],
    240   "key_findings": "PandasPlotBench introduces a 175-task benchmark for evaluating LLMs' ability to generate plotting code from natural language instructions on Pandas DataFrames. GPT-4o achieves 89% task-based accuracy, significantly outperforming smaller models; Claude 3.5 Sonnet (88%) and Llama 405B (86%) also perform well. Notably, task compression (reducing instructions from 736 to 154 symbols) has minimal impact on performance (89%→85%), suggesting concise user input is viable for practical interfaces. Performance varies substantially by plotting library: Matplotlib/Seaborn achieve 89%/84% task-based scores, but Plotly drops to 68% due to API complexity. Task-based scoring (r=0.85 with human judgment) proves more reliable than visual scoring (r=0.66) for quality assessment.",
    241   "red_flags": [
    242     {
    243       "flag": "Single human validator",
    244       "detail": "Manual scoring by first author only; no inter-rater reliability testing or multiple experts. Limits generalizability of human correlation results."
    245     },
    246     {
    247       "flag": "Matplotlib gallery source bias",
    248       "detail": "All 175 tasks derived from publicly available Matplotlib gallery. Despite synthetic task generation, models trained on gallery code could exhibit memorization bias. Paper acknowledges but does not quantify this risk."
    249     },
    250     {
    251       "flag": "GPT-4o as sole judge",
    252       "detail": "Both visual and task-based scoring use GPT-4o as judge. Could systematically bias against non-OpenAI models or favor GPT-4o's coding style."
    253     },
    254     {
    255       "flag": "Limited human baseline",
    256       "detail": "No human performance scores reported. Only expert correlation with automated scoring (r=0.85), which does not establish absolute quality threshold."
    257     },
    258     {
    259       "flag": "Small task pool",
    260       "detail": "175 tasks is modest. Difficulty distribution not characterized; coverage of real-world plotting scenarios not justified."
    261     },
    262     {
    263       "flag": "Ceiling effects unaddressed",
    264       "detail": "GPT-4o task-based score of 89% approaches ceiling. No discussion of whether benchmark discriminates among top-tier models."
    265     },
    266     {
    267       "flag": "No temporal robustness plan",
    268       "detail": "No discussion of benchmark degradation over time, gaming risk, or strategy for versioning and updates."
    269     },
    270     {
    271       "flag": "CodeBERT rejected without justification",
    272       "detail": "Paper states CodeBERT Score 'showed no correlation' and was rejected, but provides minimal explanation for why code similarity metrics fail on this task."
    273     }
    274   ],
    275   "cited_papers": [
    276     {
    277       "title": "MatPlotAgent: Method and evaluation for LLM-based agentic scientific data visualization",
    278       "relevance": "Direct prior art on LLM evaluation for plotting; PandasPlotBench positions itself as extension (175 tasks vs 25 with data)."
    279     },
    280     {
    281       "title": "Plot2Code: A comprehensive benchmark for evaluating multi-modal large language models in code generation from scientific plots",
    282       "relevance": "Related benchmark with 132 plot-to-code tasks; comparison point on image-guided vs data-driven evaluation approaches."
    283     },
    284     {
    285       "title": "DS-1000: A natural and reliable benchmark for data science code generation",
    286       "relevance": "Larger code-generation benchmark (1000 tasks); methodology for evaluating LLM code generation capability."
    287     },
    288     {
    289       "title": "ChartMimic: Evaluating LMM's cross-modal reasoning capability via chart-to-code generation",
    290       "relevance": "1000 chart-to-code examples; alternative approach to visualization code evaluation (image-based vs data-based)."
    291     },
    292     {
    293       "title": "nvBench: A large-scale synthesized dataset for cross-domain natural language to visualization task",
    294       "relevance": "7274 visualizations with NL queries; larger-scale alternative focusing on Vega-Lite and SQL rather than Python plotting."
    295     },
    296     {
    297       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    298       "relevance": "Methodology for evaluating usability of code-generation tools; relevant to construct validity of plotting assistance capability."
    299     },
    300     {
    301       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    302       "relevance": "Code evaluation metric tested and rejected in this work; raises questions about code-level vs task-level evaluation metrics."
    303     }
    304   ],
    305   "engagement_factors": {
    306     "practical_relevance": {
    307       "score": 3,
    308       "justification": "Directly applicable to data visualization workflows; benchmark is immediately usable by practitioners and researchers developing code-generation tools."
    309     },
    310     "surprise_contrarian": {
    311       "score": 1,
    312       "justification": "Findings mostly confirm expected results: Matplotlib better than Plotly (training data bias), task compression doesn't hurt (language models robust to paraphrase). No surprising discoveries."
    313     },
    314     "fear_safety": {
    315       "score": 0,
    316       "justification": "Purely about capability measurement for plotting code. No AI safety, alignment, or risk implications raised."
    317     },
    318     "drama_conflict": {
    319       "score": 0,
    320       "justification": "Straightforward methodology paper. No controversy, conflict, or dramatic findings."
    321     },
    322     "demo_ability": {
    323       "score": 2,
    324       "justification": "Benchmark is open (GitHub, HuggingFace) and runnable, but requires LLM API access and setup; not immediately interactive or beginner-friendly."
    325     },
    326     "brand_recognition": {
    327       "score": 1,
    328       "justification": "JetBrains Research known for tooling but not AI research leadership. MSR is solid software-engineering venue but not top-tier ML. Limited prestige leverage."
    329     }
    330   },
    331   "hn_data": {
    332     "threads": [
    333       {
    334         "hn_id": "38668237",
    335         "title": "Studying stars with central black holes",
    336         "points": 90,
    337         "comments": 96,
    338         "url": "https://news.ycombinator.com/item?id=38668237"
    339       },
    340       {
    341         "hn_id": "42284980",
    342         "title": "Large Language Models as Markov Chains",
    343         "points": 75,
    344         "comments": 54,
    345         "url": "https://news.ycombinator.com/item?id=42284980"
    346       },
    347       {
    348         "hn_id": "42200929",
    349         "title": "Wave Network: An Ultra-Small Language Model",
    350         "points": 27,
    351         "comments": 4,
    352         "url": "https://news.ycombinator.com/item?id=42200929"
    353       },
    354       {
    355         "hn_id": "41782325",
    356         "title": "LLMs as Markov Chains",
    357         "points": 5,
    358         "comments": 0,
    359         "url": "https://news.ycombinator.com/item?id=41782325"
    360       },
    361       {
    362         "hn_id": "42614810",
    363         "title": "Memory Layers at Scale",
    364         "points": 4,
    365         "comments": 0,
    366         "url": "https://news.ycombinator.com/item?id=42614810"
    367       },
    368       {
    369         "hn_id": "42577720",
    370         "title": "Meta: Memory Layers at Scale",
    371         "points": 4,
    372         "comments": 0,
    373         "url": "https://news.ycombinator.com/item?id=42577720"
    374       },
    375       {
    376         "hn_id": "42797706",
    377         "title": "Domain-Specific Tensor Languages",
    378         "points": 1,
    379         "comments": 0,
    380         "url": "https://news.ycombinator.com/item?id=42797706"
    381       },
    382       {
    383         "hn_id": "41777367",
    384         "title": "Large Language Models as Markov Chains",
    385         "points": 1,
    386         "comments": 0,
    387         "url": "https://news.ycombinator.com/item?id=41777367"
    388       },
    389       {
    390         "hn_id": "42204850",
    391         "title": "SEFD: Semantic-Enhanced Framework for Detecting LLM-Generated Text",
    392         "points": 1,
    393         "comments": 0,
    394         "url": "https://news.ycombinator.com/item?id=42204850"
    395       }
    396     ],
    397     "top_points": 90,
    398     "total_points": 208,
    399     "total_comments": 154
    400   }
    401 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs