scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (20755B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "Drawing Pandas: A Benchmark for LLMs in Generating Plotting Code",
      6     "authors": [
      7       "Timur Galimzyanov",
      8       "Sergey Titov",
      9       "Yaroslav Golubev",
     10       "Egor Bogomolov"
     11     ],
     12     "year": 2024,
     13     "venue": "IEEE Working Conference on Mining Software Repositories",
     14     "arxiv_id": "2412.02764",
     15     "doi": "10.1109/MSR66628.2025.00083"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims are supported: task compression having 'minimal effect' is shown in Table III (scores drop from 89 to 85 for single sentence), Plotly challenges shown in Table II (22% incorrect code), and benchmark/model comparisons in Table I.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper's causal claims (e.g., 'shortening of tasks has a minimal effect on plotting capabilities') are supported by controlled experimental manipulation — varying one factor at a time while holding others constant. The task length experiment varies only the task prompt while keeping data descriptions and instructions constant.",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Claims are generally bounded: 'Our benchmark focuses on generating code for visualizing tabular data—such as a Pandas DataFrame.' Limitations explicitly bound the scope to Python, 175 Matplotlib-derived tasks, and concise DataFrames. In addition, the benchmark name 'PandasPlotBench' appropriately scopes expectations.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper does not discuss alternative explanations for key results. For example, Plotly's poor performance could be partly due to the benchmark being derived from Matplotlib gallery tasks (structurally favoring Matplotlib-style plots), but this is mentioned only as a limitation, not as an alternative explanation for the observed result.",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper's claims mostly match the granularity of its measurements — it reports scores on its specific benchmark rather than making broad claims about 'coding ability.' The distinction between visual scores and task-based scores (Pearson correlation of 0.58) is explicitly discussed, and both are validated against human judgment.",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section V 'Limitations and Future Work' is a dedicated section discussing multiple specific limitations.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Section V discusses specific threats: potential bias toward Matplotlib code in training data, possible bias toward OpenAI models from using GPT-4 for task generation, limited dataset size of 175, concise DataFrames not representative of real-world data, and single-person manual scoring.",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Section V explicitly states scope boundaries: limited to Python ('we limited ourselves to Python code'), only Matplotlib-derived tasks ('Our dataset is currently limited to 175 data points originating from the Matplotlib gallery'), and concise DataFrames ('the DataFrames in the dataset are mostly concise, containing only the data required for plotting').",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding section or acknowledgments mentioning grants, sponsors, or funding agencies. Authors are from JetBrains Research (a corporate lab), but funding is not explicitly disclosed.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations clearly listed: JetBrains Research and Delft University of Technology. JetBrains makes development tools including AI-powered features, which is relevant context.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "While JetBrains makes development tools, the benchmark does not evaluate any JetBrains products. The results do not favor or disfavor JetBrains. The funder has no direct stake in which LLM scores highest.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement or financial disclosure section in the paper.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "Central terms used without formal definition: 'plotting capabilities', 'visual data exploration', 'benchmark', and 'visualization effectiveness' are used contextually but not precisely defined.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Contribution explicitly stated: 'we introduce the human-curated PandasPlotBench' dataset. Clear that the contribution is a benchmark resource for evaluating LLM plotting code generation.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section IV provides substantive engagement with related benchmarks (MatPlotBench, Plot2Code, ChartMimic, nvBench, DS-1000), showing how PandasPlotBench differs in scope, realism, and approach.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "benchmark-creation": {
    119       "construct_design": {
    120         "construct_validity_argued": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "Paper does not argue why generating plotting code from natural language measures 'effectiveness as visualization assistants'. The connection between code generation ability and broader capability is assumed, not validated.",
    124           "source": "haiku"
    125         },
    126         "difficulty_distribution_characterized": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "No difficulty tiers (easy/medium/hard) defined or measured per item. Performance varies across libraries (Matplotlib 89% vs Plotly 68% task-based) but difficulty is not explicitly characterized for individual tasks.",
    130           "source": "haiku"
    131         },
    132         "ceiling_floor_effects_checked": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No explicit analysis of ceiling/floor effects. Best model (GPT-4o) achieves 75/89 visual/task, worst (Llama 1B) 34/40, showing spread, but paper does not discuss or check for discrimination issues.",
    136           "source": "haiku"
    137         },
    138         "human_baseline_included": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Single human rater (first author) validated only GPT-4o results for correlation check (r=0.85 with task-based scoring). No overall human performance reported as benchmark baseline.",
    142           "source": "haiku"
    143         },
    144         "scoring_rubric_justified": {
    145           "applies": true,
    146           "answer": true,
    147           "justification": "Visual and task-based metrics justified: 'generated images do not have to match ground-truth plots exactly' explains why task-based preferred (r=0.85 vs visual r=0.66 with human raters). Two metrics justified by correlation analysis.",
    148           "source": "haiku"
    149         }
    150       },
    151       "robustness": {
    152         "contamination_resistance_designed": {
    153           "applies": true,
    154           "answer": true,
    155           "justification": "Synthetic data and tasks designed for contamination resistance: 'benchmark aims to be free from data leakage, as tasks and data files are synthetically generated'. Acknowledged caveat: models may have bias toward Matplotlib code.",
    156           "source": "haiku"
    157         },
    158         "temporal_robustness_discussed": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "No discussion of temporal robustness, update plans, or how benchmark will remain useful over time. Future work mentions expanding scope but not maintaining against gaming or obsolescence.",
    162           "source": "haiku"
    163         },
    164         "failure_modes_discussed": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "Paper discusses specific failure cases (Plotly 22% error rate from API misuse) but does not discuss failure modes of the benchmark itself—what it cannot measure or what could game it.",
    168           "source": "haiku"
    169         },
    170         "baseline_implementations_provided": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Paper states 'we also open-source our implementation of the baseline methods and the evaluation code, available on our GitHub page' with links to HuggingFace dataset and GitHub repository.",
    174           "source": "haiku"
    175         }
    176       },
    177       "documentation": {
    178         "dataset_documentation_complete": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Section II.A-B provides collection process (Matplotlib gallery scripts→filtering→code splitting→verification), data structure (CSV, scripts, ground truth, three task versions), and generation methodology (GPT-4 splitting, GPT-4V task creation with manual review).",
    182           "source": "haiku"
    183         },
    184         "licensing_and_access_clear": {
    185           "applies": true,
    186           "answer": false,
    187           "justification": "Access is clear (HuggingFace and GitHub links provided) but licensing terms are not specified in the paper. Cannot determine from the paper alone what license governs use and redistribution.",
    188           "source": "haiku"
    189         },
    190         "intended_use_specified": {
    191           "applies": true,
    192           "answer": false,
    193           "justification": "Intended use implicitly stated: 'assess AI models as assistants in visual data exploration' and 'improve user experience in data visualization'. But paper does not explicitly specify what should NOT be concluded or use limitations.",
    194           "source": "haiku"
    195         }
    196       }
    197     }
    198   },
    199   "claims": [
    200     {
    201       "claim": "Task compression has minimal effect on plotting performance",
    202       "evidence": "Table III: single-sentence task scores 71/85 vs basic task 75/89, only 4-point difference",
    203       "supported": "strong"
    204     },
    205     {
    206       "claim": "GPT-4o and Claude 3.5 Sonnet are top-performing models",
    207       "evidence": "Table I: GPT-4o 75 visual/89 task, Claude 3.5 Sonnet 73/88, significantly ahead of others",
    208       "supported": "strong"
    209     },
    210     {
    211       "claim": "LLMs struggle significantly with Plotly library",
    212       "evidence": "Table II: Plotly 59 visual/68 task-based vs Matplotlib 75/89, 22% wrong code generation rate",
    213       "supported": "strong"
    214     },
    215     {
    216       "claim": "Plotly errors primarily result from API misuse by models",
    217       "evidence": "Manual review of 38 error cases found most 'due to incorrect usage of Plotly library API'",
    218       "supported": "moderate"
    219     },
    220     {
    221       "claim": "Task-based scoring correlates better with human judgment than visual scoring",
    222       "evidence": "Pearson r=0.85 (task-based) vs r=0.66 (visual) against single human rater's assessment",
    223       "supported": "strong"
    224     },
    225     {
    226       "claim": "DataFrame description is more important than task detail length",
    227       "evidence": "Table III: removing generic task drops scores to 44/36, but shortening detailed task only drops 4 points; structured DataFrame description maintains performance",
    228       "supported": "strong"
    229     },
    230     {
    231       "claim": "Synthetic data prevents data contamination compared to real-world training data",
    232       "evidence": "Tasks and DataFrames synthetically generated, never seen by LLMs in training; acknowledged caveat about Matplotlib code bias",
    233       "supported": "moderate"
    234     },
    235     {
    236       "claim": "CodeBERT Score is inadequate for assessing plotting code quality",
    237       "evidence": "CodeBERT showed no correlation with visual or task scores despite measuring code similarity",
    238       "supported": "moderate"
    239     }
    240   ],
    241   "methodology_tags": [
    242     "benchmark-eval",
    243     "case-study"
    244   ],
    245   "key_findings": "The paper introduces PandasPlotBench, a 175-item human-curated dataset for evaluating LLMs' ability to generate plotting code from natural language and Pandas DataFrames. Task compression has minimal impact on performance (71/85 for single-sentence vs 75/89 for detailed tasks), with GPT-4o and Claude 3.5 Sonnet achieving the highest scores; however, LLMs struggle significantly with Plotly (22% error rate) despite strong performance on Matplotlib and Seaborn. Task-based scoring correlates strongly with human judgment (r=0.85), better than visual scoring (r=0.66), and structured DataFrame descriptions matter more than task verbosity for model performance.",
    246   "red_flags": [
    247     {
    248       "flag": "single human rater",
    249       "detail": "Manual validation performed only by first author (5 years experience); no inter-rater reliability reported for full dataset, only correlation check on GPT-4o results"
    250     },
    251     {
    252       "flag": "LLM judge bias",
    253       "detail": "Scoring performed by GPT-4o itself (the top-performing model), potentially biasing results toward its own outputs; cannot assess if this inflates GPT-4o scores"
    254     },
    255     {
    256       "flag": "small benchmark size",
    257       "detail": "175 tasks is modest; limited statistical power for subgroup analysis or detecting small effects"
    258     },
    259     {
    260       "flag": "limited data realism",
    261       "detail": "DataFrames 'mostly concise, containing only the data required for plotting, often far from real-world data'; does not reflect complex messy real-world datasets"
    262     },
    263     {
    264       "flag": "task generation via LLM chain",
    265       "detail": "Three-step process (Matplotlib scripts → GPT-4 code splitting → GPT-4V task generation) with acknowledged information loss; tasks may not accurately reflect original intent"
    266     },
    267     {
    268       "flag": "no ground truth for non-Matplotlib libraries",
    269       "detail": "Ground truth plots always Matplotlib; visual scoring for Seaborn/Plotly inherently biased since scoring compares against Matplotlib plots"
    270     },
    271     {
    272       "flag": "weak correlation between scoring approaches",
    273       "detail": "Pearson r=0.58 between visual and task-based scores; suggests they measure different constructs but paper treats them as complementary rather than investigating divergence"
    274     },
    275     {
    276       "flag": "construct validity not argued",
    277       "detail": "Paper assumes generating plotting code measures 'effectiveness as visualization assistant' but does not argue why code generation equals good assistantship"
    278     },
    279     {
    280       "flag": "potential OpenAI model bias",
    281       "detail": "GPT-4 and GPT-4V used for data processing and task generation; acknowledged as 'potentially negligible' but not empirically tested"
    282     }
    283   ],
    284   "cited_papers": [
    285     {
    286       "title": "MatPlotAgent: Method and evaluation for LLM-based agentic scientific data visualization",
    287       "relevance": "Closely related benchmark for evaluating LLM plotting code generation; methodology inherited from this work"
    288     },
    289     {
    290       "title": "Plot2Code: A comprehensive benchmark for evaluating multi-modal large language models in code generation from scientific plots",
    291       "relevance": "Related benchmark-creation paper; establishes methodology for plotting code evaluation that PandasPlotBench builds on"
    292     },
    293     {
    294       "title": "ChartMimic: Evaluating LMM's cross-modal reasoning capability via chart-to-code generation",
    295       "relevance": "Related benchmark for chart-to-code generation; comparable approach to evaluating visualization code generation"
    296     },
    297     {
    298       "title": "DS-1000: A natural and reliable benchmark for data science code generation",
    299       "relevance": "Related benchmark for data science tasks; demonstrates benchmark design for code generation evaluation at scale"
    300     },
    301     {
    302       "title": "nvBench: A large-scale synthesized dataset for cross-domain natural language to visualization task",
    303       "relevance": "Related large-scale benchmark for NL-to-visualization; covers broader scope including SQL and Vega-Lite languages"
    304     },
    305     {
    306       "title": "Leveraging large language models for data analysis automation",
    307       "relevance": "Applications of LLMs in data analysis pipeline; motivates need for evaluation benchmarks"
    308     },
    309     {
    310       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    311       "relevance": "Code similarity metric attempted for benchmark evaluation; found inadequate and abandoned"
    312     }
    313   ],
    314   "engagement_factors": {
    315     "practical_relevance": {
    316       "score": 2,
    317       "justification": "Practitioners using LLMs for data visualization can use this benchmark to evaluate tools; the task compression finding is directly useful for UI design."
    318     },
    319     "surprise_contrarian": {
    320       "score": 1,
    321       "justification": "Plotly underperformance is expected given training data distribution; the minimal effect of task compression is mildly interesting but not surprising."
    322     },
    323     "fear_safety": {
    324       "score": 0,
    325       "justification": "No AI safety, security, or risk concerns raised by this work."
    326     },
    327     "drama_conflict": {
    328       "score": 0,
    329       "justification": "No controversy or conflict in the findings."
    330     },
    331     "demo_ability": {
    332       "score": 2,
    333       "justification": "Code and dataset publicly available on GitHub and HuggingFace; researchers can run the benchmark but it's not pip-installable."
    334     },
    335     "brand_recognition": {
    336       "score": 1,
    337       "justification": "JetBrains is well-known in developer tooling but not a top-tier AI research lab; evaluates recognizable models (GPT-4o, Claude)."
    338     }
    339   },
    340   "hn_data": {
    341     "threads": [
    342       {
    343         "hn_id": "38668237",
    344         "title": "Studying stars with central black holes",
    345         "points": 90,
    346         "comments": 96,
    347         "url": "https://news.ycombinator.com/item?id=38668237"
    348       },
    349       {
    350         "hn_id": "42284980",
    351         "title": "Large Language Models as Markov Chains",
    352         "points": 75,
    353         "comments": 54,
    354         "url": "https://news.ycombinator.com/item?id=42284980"
    355       },
    356       {
    357         "hn_id": "42200929",
    358         "title": "Wave Network: An Ultra-Small Language Model",
    359         "points": 27,
    360         "comments": 4,
    361         "url": "https://news.ycombinator.com/item?id=42200929"
    362       },
    363       {
    364         "hn_id": "41782325",
    365         "title": "LLMs as Markov Chains",
    366         "points": 5,
    367         "comments": 0,
    368         "url": "https://news.ycombinator.com/item?id=41782325"
    369       },
    370       {
    371         "hn_id": "42614810",
    372         "title": "Memory Layers at Scale",
    373         "points": 4,
    374         "comments": 0,
    375         "url": "https://news.ycombinator.com/item?id=42614810"
    376       },
    377       {
    378         "hn_id": "42577720",
    379         "title": "Meta: Memory Layers at Scale",
    380         "points": 4,
    381         "comments": 0,
    382         "url": "https://news.ycombinator.com/item?id=42577720"
    383       },
    384       {
    385         "hn_id": "42797706",
    386         "title": "Domain-Specific Tensor Languages",
    387         "points": 1,
    388         "comments": 0,
    389         "url": "https://news.ycombinator.com/item?id=42797706"
    390       },
    391       {
    392         "hn_id": "41777367",
    393         "title": "Large Language Models as Markov Chains",
    394         "points": 1,
    395         "comments": 0,
    396         "url": "https://news.ycombinator.com/item?id=41777367"
    397       },
    398       {
    399         "hn_id": "42204850",
    400         "title": "SEFD: Semantic-Enhanced Framework for Detecting LLM-Generated Text",
    401         "points": 1,
    402         "comments": 0,
    403         "url": "https://news.ycombinator.com/item?id=42204850"
    404       }
    405     ],
    406     "top_points": 90,
    407     "total_points": 208,
    408     "total_comments": 154
    409   }
    410 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs