scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29370B)
      1 {
      2   "paper": {
      3     "title": "Drawing Pandas: A Benchmark for LLMs in Generating Plotting Code",
      4     "authors": [
      5       "Timur Galimzyanov",
      6       "Sergey Titov",
      7       "Yaroslav Golubev",
      8       "Egor Bogomolov"
      9     ],
     10     "year": 2024,
     11     "venue": "IEEE Working Conference on Mining Software Repositories",
     12     "arxiv_id": "2412.02764",
     13     "doi": "10.1109/MSR66628.2025.00083"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "PandasPlotBench is a 175-task human-curated benchmark for evaluating LLMs on generating plotting code from Pandas DataFrames. GPT-4o and Claude 3.5 Sonnet lead with task-based mean scores of ~89, and compressing tasks substantially (down to a single sentence) only minimally affects performance. Plotly generation is notably harder, with 22% incorrect code vs 1.8% for Matplotlib. Large open-source models (Llama 3.1 70B, 405B) approach proprietary model performance.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "GitHub repository provided: https://github.com/JetBrains-Research/PandasPlotBench. The paper states 'we also open-source our implementation of the baseline methods and the evaluation code, available on our GitHub page.'"
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Dataset released on HuggingFace: https://huggingface.co/datasets/JetBrains-Research/PandasPlotBench. Explicitly stated in abstract and Section II-A."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No requirements.txt, Dockerfile, or detailed environment specification in the paper. The paper mentions using a Jupyter Notebook environment but provides no library versions or dependency details."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper provides no step-by-step reproduction instructions. It links to GitHub and supplementary materials but does not include a 'Reproducing Results' section or specific commands to run the benchmark."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Despite running each experiment 5 times, the paper reports only mean scores and ratios in Tables I-III. No confidence intervals, error bars, or ± notation are provided."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims GPT-4o and Claude 3.5 Sonnet 'share the highest scores' and that task compression has 'minimal effect' without any statistical significance tests. Comparative claims are based solely on comparing raw numbers."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Results are presented as raw scores (0-100 scale) in tables without explicit effect size calculations, percentage improvements, or baseline-relative measures. The reader can compare numbers but no effect sizes are computed or discussed."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The dataset contains 175 data points reduced from 501 through filtering, but no justification is given for why 175 is sufficient for the claims being made. No power analysis is discussed."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper states 'we ran the benchmark five times for each factor to ensure the robustness of our results' but reports only mean scores. No standard deviations, IQR, or any spread measure is reported across the 5 runs."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Multiple models compared: GPT-4o, Claude 3.5 Sonnet, Claude 3 Opus, Claude 3 Haiku, Gemini 1.5 Pro, and Llama 3.1/3.2 variants (Table I). A 'no task' control condition is also included (Table III)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "All tested models are contemporary (2024 vintage): GPT-4o, Claude 3.5 Sonnet, Gemini 1.5 Pro, Llama 3.1/3.2 family. These represent state-of-the-art at time of writing."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper systematically varies prompt components: DataFrame description format (Section III), task length with 5 levels from full to none (Table III), and target plotting library (Table II). These serve as ablations of the prompt design."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Three metrics used: visual scoring (0-100 comparison to ground truth), task-based scoring (0-100 adherence to task description), and incorrect code ratio. Ratio of 'good' scores (≥75) also reported."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Section II-C: 'The first author of the paper, with 5 years of experience in Python and data visualization, went through all the data points and manually scored them judging the adherence of the resulting plot to the task description.' Human-LLM score correlation analyzed (Pearson 0.85 for task-based)."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "The paper uses the same 175 tasks for both selecting the best DataFrame description method ('we experimented with different DataFrame descriptions and found that the head(5) method... delivered the best performance') and reporting final results. No dev/test split."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results broken down by model (Table I), by plotting library (Table II), and by task length (Table III). These provide meaningful category-level insights into where performance varies."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section III-B: 'We manually reviewed these cases and found that most errors were due to the incorrect usage of the Plotly library API by the language model.' Plotly failure analysis and small Llama model failures discussed."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Several negative results: Plotly's 22% incorrect code rate, small Llama models 'often fail to produce functional code' (Table I), the 'no task' condition producing only 23% good scores, and CodeBERT Score found inadequate and abandoned (Section II-C)."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims are supported: task compression having 'minimal effect' is shown in Table III (scores drop from 89 to 85 for single sentence), Plotly challenges shown in Table II (22% incorrect code), and benchmark/model comparisons in Table I."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper's causal claims (e.g., 'shortening of tasks has a minimal effect on plotting capabilities') are supported by controlled experimental manipulation — varying one factor at a time while holding others constant. The task length experiment varies only the task prompt while keeping data descriptions and instructions constant."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Claims are generally bounded: 'Our benchmark focuses on generating code for visualizing tabular data—such as a Pandas DataFrame.' Limitations explicitly bound the scope to Python, 175 Matplotlib-derived tasks, and concise DataFrames. The benchmark name 'PandasPlotBench' also appropriately scopes expectations."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper does not discuss alternative explanations for key results. For example, Plotly's poor performance could be partly due to the benchmark being derived from Matplotlib gallery tasks (structurally favoring Matplotlib-style plots), but this is mentioned only as a limitation, not as an alternative explanation for the observed result."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper's claims mostly match the granularity of its measurements — it reports scores on its specific benchmark rather than making broad claims about 'coding ability.' The distinction between visual scores and task-based scores (Pearson 0.58 correlation) is explicitly discussed, and both are validated against human judgment."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Models listed as 'GPT-4o', 'Claude 3.5 Sonnet', 'Gemini 1.5 Pro' etc. without snapshot dates or API versions. Per the schema, marketing names without snapshot dates do not count as specified versions."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The paper states: 'For the full list of instruction prompts used for processing the data, generating the results, and scoring... please refer to the supplementary materials' with a link to GitHub repository containing the actual prompts."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "No temperature, top-p, max tokens, or any other API hyperparameters are reported for any of the tested models."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. Models receive prompts directly and return code in a single turn."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section II-A documents the full pipeline with counts at each stage: 501 gallery scripts → 307 valid after execution → 201 after GPT-4 splitting and manual review → 175 after final validation. Each step's criteria are described."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section V 'Limitations and Future Work' is a dedicated section discussing multiple specific limitations."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section V discusses specific threats: potential bias toward Matplotlib code in training data, possible bias toward OpenAI models from using GPT-4 for task generation, limited dataset size of 175, concise DataFrames not representative of real-world data, and single-person manual scoring."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section V explicitly states scope boundaries: limited to Python ('we limited ourselves to Python code'), only Matplotlib-derived tasks ('Our dataset is currently limited to 175 data points originating from the Matplotlib gallery'), and concise DataFrames ('the DataFrames in the dataset are mostly concise, containing only the data required for plotting')."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full dataset including CSV files, tasks, and ground truth plots is available on HuggingFace. Evaluation code and supplementary materials available on GitHub."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section II-A describes the complete data collection procedure: sourcing from Matplotlib gallery, script execution in Jupyter, GPT-4 splitting, manual verification at each step, and final validation."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants were recruited. Data sourced from the public Matplotlib gallery. The single human evaluator is the first author, not a recruited participant."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Full pipeline documented with counts at each stage: 501 → 307 (after execution filtering) → 201 (after GPT-4 splitting + manual review) → 175 (after final validation). Each transformation step is described in Section II-A."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding section or acknowledgments mentioning grants, sponsors, or funding agencies. Authors are from JetBrains Research (a corporate lab), but funding is not explicitly disclosed."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations clearly listed: JetBrains Research and Delft University of Technology. JetBrains makes development tools including AI-powered features, which is relevant context."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "While JetBrains makes development tools, the benchmark does not evaluate any JetBrains products. The results do not favor or disfavor JetBrains. The funder has no direct stake in which LLM scores highest."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests statement or financial disclosure section in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No training data cutoff dates are stated for any of the evaluated models (GPT-4o, Claude, Gemini, Llama)."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Section V discusses potential overlap: 'models may still exhibit bias towards publicly available ground-truth Matplotlib code. However, this code has been modified, and solutions using other libraries remain unseen.' Also: 'our tasks are less prone to models memorizing training data.'"
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "The paper explicitly designs for contamination resistance: 'Our benchmark aims to be free from data leakage, as the tasks and data files are synthetically generated.' Section V acknowledges the Matplotlib gallery source code may be in training data but argues the tasks and solutions are novel."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human subjects study. The benchmark evaluation involves only LLM outputs and automated/author scoring."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. The single human scorer is the first author evaluating LLM outputs."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in the study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in the study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Table I reports wall-clock time per item (s/item) for each model, ranging from 0.3s (Llama 3.2 1B) to 14.0s (Claude 3 Opus). However, no API costs or token counts are provided."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No total computational budget stated. No GPU hours, total API costs, or hardware specifications reported beyond per-item latency."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The paper ran each experiment 5 times but reports only mean scores. No seed sensitivity analysis or per-run results are provided."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "Section III: 'For all experiments, we ran the benchmark five times for each factor to ensure the robustness of our results.'"
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper experimented with different DataFrame description formats but does not report how many configurations were tried or the search budget."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The best DataFrame description method (head(5) + column names/types) was selected by testing on the same 175-task dataset used for final reporting. No validation/test split was used, and not all configurations tried are reported."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The paper compares 10 models and multiple experimental conditions without any statistical significance tests, let alone multiple comparison corrections."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors created the benchmark and evaluate all models on it without discussing potential author-evaluation bias or having independent evaluators."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "While per-item latency is reported alongside scores in Table I, there is no analysis of performance as a function of compute budget or comparison at matched compute levels."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper does not discuss whether its benchmark (175 Matplotlib-derived tasks with concise DataFrames) actually measures the claimed capability of 'effectiveness as assistants in visual data exploration.' The gap between controlled benchmark tasks and real-world data visualization is not analyzed."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding involved. Models are tested directly via API with the same prompt structure."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of temporal aspects — when the Matplotlib gallery code was published vs. when model training data was collected. The argument focuses on synthetic generation but ignores the timeline."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the evaluation setup leaks answer information through context. The DataFrame descriptions and task prompts could contain information that makes generation trivially easy."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether training data and test examples share structural similarities. The benchmark tasks derive from the public Matplotlib gallery, which models may have seen variants of during training."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No concrete leakage detection method applied (no canary strings, membership inference, n-gram overlap analysis). The paper argues contamination resistance by design but does not test for it."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "Task compression (shortening instructions to a single sentence) has minimal effect on LLM plotting capabilities.",
    370       "evidence": "Table III shows task-based mean score drops from 89 (full task) to 85 (single sentence), and visual score from 75 to 71. Good score ratio drops from 0.91 to 0.85 (task-based).",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "GPT-4o and Claude 3.5 Sonnet are the highest-scoring models for plotting code generation.",
    375       "evidence": "Table I: GPT-4o achieves 89 task-based mean, Claude 3.5 Sonnet achieves 88. Both outperform other models.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "LLMs struggle significantly with Plotly compared to Matplotlib and Seaborn.",
    380       "evidence": "Table II: Plotly has 22% incorrect code vs 1.8% for Matplotlib, and task-based mean score of 68 vs 89 for Matplotlib.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Large open-source Llama models approach proprietary model performance.",
    385       "evidence": "Table I: Llama 3.1 405B achieves 86 task-based mean vs 89 for GPT-4o; Llama 3.1 70B achieves 82.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "GPT-4o judge's task-based scoring correlates strongly with human evaluation (Pearson r=0.85).",
    390       "evidence": "Section II-C reports Pearson correlation of 0.85 between human scores and task-based scores, validated by first author scoring all data points.",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "The benchmark is free from data leakage by virtue of being synthetically generated.",
    395       "evidence": "Section I states tasks and data files are synthetically generated. Section V argues there is 'no direct link between the answers to benchmark tasks and the raw repository data.'",
    396       "supported": "weak"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "Single human evaluator",
    402       "detail": "Human validation was performed by only the first author. No inter-rater reliability measured. The paper acknowledges this: 'the manual scoring of the data points was carried out only by the first author, as a general sanity check.' This is insufficient to validate the benchmark's scoring."
    403     },
    404     {
    405       "flag": "GPT-4o as both contestant and judge",
    406       "detail": "GPT-4o is the highest-scoring model in Table I and is also used as the judge (GPT-4o Judge) for scoring all models. This creates a potential self-preference bias where GPT-4o may rate its own outputs more favorably."
    407     },
    408     {
    409       "flag": "No variance reported despite multiple runs",
    410       "detail": "Each experiment was run 5 times but only mean scores are reported with no standard deviations, confidence intervals, or any spread measure. The reader cannot assess whether observed differences are meaningful or within noise."
    411     },
    412     {
    413       "flag": "Configuration selection on test data",
    414       "detail": "The best DataFrame description format was selected by testing on the same 175 tasks used for final reporting. No held-out validation set was used, risking optimistic results."
    415     }
    416   ],
    417   "cited_papers": [
    418     {
    419       "title": "MatPlotAgent: Method and evaluation for LLM-based agentic scientific data visualization",
    420       "authors": ["Z. Yang", "Z. Zhou", "S. Wang"],
    421       "year": 2024,
    422       "arxiv_id": "2402.11453",
    423       "relevance": "Directly related benchmark for LLM-based data visualization with agentic methods; primary comparison point for this work."
    424     },
    425     {
    426       "title": "Plot2Code: A comprehensive benchmark for evaluating multi-modal large language models in code generation from scientific plots",
    427       "authors": ["C. Wu", "Y. Ge", "Q. Guo"],
    428       "year": 2024,
    429       "arxiv_id": "2405.07990",
    430       "relevance": "Benchmark for multi-modal LLM code generation from plots; closely related evaluation of LLM plotting capabilities."
    431     },
    432     {
    433       "title": "ChartMimic: Evaluating LMM's cross-modal reasoning capability via chart-to-code generation",
    434       "authors": ["C. Shi", "C. Yang", "Y. Liu"],
    435       "year": 2024,
    436       "arxiv_id": "2406.09961",
    437       "relevance": "1,000 human-curated examples for chart-to-code generation; directly comparable benchmark for LLM visualization capabilities."
    438     },
    439     {
    440       "title": "DS-1000: A natural and reliable benchmark for data science code generation",
    441       "authors": ["Y. Lai", "C. Li", "Y. Wang"],
    442       "year": 2023,
    443       "relevance": "Data science code generation benchmark including Matplotlib tasks; establishes baseline for LLM code generation evaluation."
    444     },
    445     {
    446       "title": "nvBench: A large-scale synthesized dataset for cross-domain natural language to visualization task",
    447       "authors": ["Y. Luo", "J. Tang", "G. Li"],
    448       "year": 2021,
    449       "arxiv_id": "2112.12926",
    450       "relevance": "Large-scale NL-to-visualization benchmark with 7,274 visualizations; related evaluation of automated visualization generation."
    451     },
    452     {
    453       "title": "Leveraging large language models for data analysis automation",
    454       "authors": ["J. A. Jansen", "A. Manukyan", "N. Al Khoury", "A. Akalin"],
    455       "year": 2025,
    456       "relevance": "Studies LLM automation of data analysis pipelines including visualization; directly relevant to LLM code generation capabilities."
    457     },
    458     {
    459       "title": "LLMs for science: Usage for code generation and data analysis",
    460       "authors": ["M. Nejjar", "L. Zacharias", "F. Stiehle", "I. Weber"],
    461       "year": 2023,
    462       "relevance": "Evaluates LLM usage for scientific code generation and data analysis; relevant to understanding LLM coding capabilities."
    463     },
    464     {
    465       "title": "The Llama 3 herd of models",
    466       "authors": ["A. Dubey", "A. Jauhri", "A. Pandey"],
    467       "year": 2024,
    468       "arxiv_id": "2407.21783",
    469       "relevance": "Open-source LLM family evaluated in this benchmark; important for understanding open vs proprietary model performance gaps."
    470     },
    471     {
    472       "title": "GPT-4 technical report",
    473       "authors": ["J. Achiam", "S. Adler", "S. Agarwal"],
    474       "year": 2023,
    475       "arxiv_id": "2303.08774",
    476       "relevance": "Technical report for GPT-4, used for dataset construction; central model in LLM capability evaluation."
    477     },
    478     {
    479       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    480       "authors": ["P. Vaithilingam", "T. Zhang", "E. L. Glassman"],
    481       "year": 2022,
    482       "relevance": "User study evaluating LLM code generation tools; provides context on challenges in producing fully executable code."
    483     },
    484     {
    485       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    486       "authors": ["Z. Feng", "D. Guo", "D. Tang"],
    487       "year": 2020,
    488       "arxiv_id": "2002.08155",
    489       "relevance": "Pre-trained code model whose CodeBERT Score metric was evaluated and rejected in this benchmark's development."
    490     }
    491   ],
    492   "engagement_factors": {
    493     "practical_relevance": {
    494       "score": 2,
    495       "justification": "Practitioners using LLMs for data visualization can use this benchmark to evaluate tools; the task compression finding is directly useful for UI design."
    496     },
    497     "surprise_contrarian": {
    498       "score": 1,
    499       "justification": "Plotly underperformance is expected given training data distribution; the minimal effect of task compression is mildly interesting but not surprising."
    500     },
    501     "fear_safety": {
    502       "score": 0,
    503       "justification": "No AI safety, security, or risk concerns raised by this work."
    504     },
    505     "drama_conflict": {
    506       "score": 0,
    507       "justification": "No controversy or conflict in the findings."
    508     },
    509     "demo_ability": {
    510       "score": 2,
    511       "justification": "Code and dataset publicly available on GitHub and HuggingFace; researchers can run the benchmark but it's not pip-installable."
    512     },
    513     "brand_recognition": {
    514       "score": 1,
    515       "justification": "JetBrains is well-known in developer tooling but not a top-tier AI research lab; evaluates recognizable models (GPT-4o, Claude)."
    516     }
    517   }
    518 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs