scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30166B)
      1 {
      2   "paper": {
      3     "title": "MatPlotAgent: Method and Evaluation for LLM-Based Agentic Scientific Data Visualization",
      4     "authors": [
      5       "Zhiyu Yang",
      6       "Zihan Zhou",
      7       "Shuo Wang",
      8       "Xin Cong",
      9       "Xu Han",
     10       "Yukun Yan",
     11       "Zhenghao Liu",
     12       "Zhixing Tan",
     13       "Pengyuan Liu",
     14       "Dong Yu",
     15       "Zhiyuan Liu",
     16       "Xiaodong Shi",
     17       "Maosong Sun"
     18     ],
     19     "year": 2024,
     20     "venue": "Annual Meeting of the Association for Computational Linguistics",
     21     "arxiv_id": "2402.11453",
     22     "doi": "10.48550/arXiv.2402.11453"
     23   },
     24   "scan_version": 2,
     25   "active_modules": ["experimental_rigor", "data_leakage"],
     26   "methodology_tags": ["benchmark-eval"],
     27   "key_findings": "MatPlotAgent, an LLM agent framework combining code generation with iterative debugging and visual feedback from multi-modal LLMs, improves scientific data visualization performance across multiple LLMs (up to +13.21 points on MatPlotBench). The proposed automatic evaluation using GPT-4V correlates strongly with human evaluation (r=0.876 for GPT-4, r=0.836 for GPT-3.5). The visual feedback mechanism is the key driver of improvement; without it, GPT-4's gain over direct decoding shrinks from +12.30 to +4.58 points (roughly 63% of the improvement is lost). Zero-shot chain-of-thought prompting generally hurts performance on this task.",
     28   "checklist": {
     29     "artifacts": {
     30       "code_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "GitHub repository URL provided in footnote 1: 'MatPlotAgent and MatPlotBench are publicly available at https://github.com/thunlp/MatPlotAgent.'"
     34       },
     35       "data_released": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "MatPlotBench (100 test cases with queries, raw data, and ground-truth figures) is released via the same GitHub repository."
     39       },
     40       "environment_specified": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No requirements.txt, Dockerfile, or detailed environment setup is provided in the paper. Only mentions using vLLM for inference and scipy for statistics, without version specifications."
     44       },
     45       "reproduction_instructions": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "No step-by-step reproduction instructions are provided in the paper. The framework is described conceptually but there are no specific commands or scripts to replicate experiments."
     49       }
     50     },
     51     "statistical_methodology": {
     52       "confidence_intervals_or_error_bars": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "All results in Tables 1-4 are reported as point estimates only (e.g., 48.86, 61.16). No confidence intervals or error bars are provided for the main performance comparisons."
     56       },
     57       "significance_tests": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No significance tests are used for the main performance comparisons in Tables 1-4. Claims like 'MatPlotAgent leads to significant improvements of 12.30 and 9.48' are based solely on comparing raw numbers. Pearson correlation (r, p-value) is used only for validating the automatic evaluation mechanism, not for model comparisons."
     61       },
     62       "effect_sizes_reported": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Absolute improvements are reported with baseline context throughout: e.g., 'GPT-4: 48.86 → 61.16 (+12.30)', 'GPT-3.5: 38.03 → 47.51 (+9.48)' in Tables 1-4. The reader can assess the magnitude of improvement relative to the baseline."
     66       },
     67       "sample_size_justified": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "MatPlotBench contains 100 test examples. No justification is given for why 100 was chosen, no power analysis, and no discussion of whether 100 is sufficient to distinguish model capabilities."
     71       },
     72       "variance_reported": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No variance or standard deviation is reported across experimental runs. The paper states temperature is set to 0.0 for code LLMs, implying single-run results, but does not report variance or verify determinism for API-based models."
     76       }
     77     },
     78     "evaluation_design": {
     79       "baselines_included": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Multiple baselines are compared: direct decoding (no augmentation), zero-shot chain-of-thought, and MatPlotAgent. Results are shown for 7 different LLMs in Table 1."
     83       },
     84       "baselines_contemporary": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Baselines include GPT-4, GPT-3.5, and five recent open-source code LLMs from four model families (Magicoder, DeepSeek-Coder at 6.7B and 33B, CodeLlama, WizardCoder), all published in 2023-2024 and representing the state of the art at the time."
     88       },
     89       "ablation_study": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Table 4 presents an ablation study showing the effect of removing the visual feedback mechanism. For GPT-4: 61.16 (full) vs 53.44 (w/o visual feedback) vs 48.86 (direct). Table 3 also ablates visual feedback on Qwen-Agent benchmark."
     93       },
     94       "multiple_metrics": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "On MatPlotBench, only a single metric is used: the GPT-4V automatic score (0-100). Human evaluation is used solely to validate the automatic scoring mechanism, not as an independent evaluation metric. The Qwen-Agent benchmark uses accuracy, but that's a separate benchmark."
     98       },
     99       "human_evaluation": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 3.2 describes human evaluation of model-generated figures. Human annotators score outputs, and these scores are correlated with automatic evaluation. Section B provides details on annotator recruitment and compensation."
    103       },
    104       "held_out_test_set": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "MatPlotBench is a dedicated test set of 100 examples not used for any tuning or development decisions. MatPlotAgent's design is fixed before evaluation."
    108       },
    109       "per_category_breakdown": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "On MatPlotBench, only aggregate scores per model are reported. No breakdown by plot type, difficulty level, or source (Matplotlib vs OriginLab) despite the benchmark containing diverse categories. The Qwen-Agent results in Table 3 have a hard/easy split, but not the main benchmark."
    113       },
    114       "failure_cases_discussed": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Section 5.5 (Case Study) shows examples of varying difficulty including failures. Figure 5 displays cases where 'none of the three models can produce the correct result.' Figure 4 shows outputs before and after visual feedback."
    118       },
    119       "negative_results_reported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Table 1 shows zero-shot CoT hurts performance for most models (e.g., GPT-4 drops from 48.86 to 45.42). MatPlotAgent with CodeLlama-34B-Instruct also produces worse results (-2.36). These negative results are reported without omission."
    123       }
    124     },
    125     "claims_and_evidence": {
    126       "abstract_claims_supported": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Abstract claims that MatPlotAgent 'can improve the performance of various LLMs' are supported by Tables 1-3. The claim of 'strong correlation with human-annotated scores' is supported by r=0.876 (GPT-4) and r=0.836 (GPT-3.5) in Section 3.2."
    130       },
    131       "causal_claims_justified": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper claims MatPlotAgent 'improves' performance. The ablation study (Table 4) removes the visual feedback component in a controlled manner. Model comparisons use the same benchmark with/without the framework. This controlled single-variable manipulation is adequate for the causal claims made."
    135       },
    136       "generalization_bounded": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The title claims 'Scientific Data Visualization' broadly, but MatPlotBench contains only 100 examples from Matplotlib Gallery (75) and OriginLab GraphGallery (25). The limitations section acknowledges domain-specific limitations but does not bound the results to the specific visualization types and libraries tested."
    140       },
    141       "alternative_explanations_discussed": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "No alternative explanations are discussed for the observed improvements. For example, the paper does not consider whether the improvement from visual feedback is due to the additional LLM call itself (more compute) rather than the visual signal, or whether prompt quality variations explain the results."
    145       },
    146       "proxy_outcome_distinction": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "The paper explicitly validates its proxy metric (GPT-4V automatic scoring) against human evaluation, demonstrating strong correlation (r=0.876). The paper acknowledges the automatic score is a proxy and provides the correlation analysis as evidence of its validity."
    150       }
    151     },
    152     "setup_transparency": {
    153       "model_versions_specified": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "Commercial models are referred to only as 'GPT-4', 'GPT-3.5', and 'GPT-4V' without API version or snapshot date (e.g., no 'gpt-4-0613'). Open-source models have specific identifiers (e.g., 'Magicoder-S-DS-6.7B') but OpenAI and Google models lack version specificity."
    157       },
    158       "prompts_provided": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The appendix provides full prompt text for all components: automatic evaluation (Figure 6), query expansion (Figure 7), code generation (Figure 8), self-debugging (Figure 9), and visual agent (Figure 10). These are actual prompts, not just descriptions."
    162       },
    163       "hyperparameters_reported": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "Temperature is reported as 0.0 for code LLMs, and correlation experiment parameters (n=25, k=100) are stated. However, temperature and sampling settings for GPT-4V (visual agent and evaluator) and Gemini Pro Vision are not reported."
    167       },
    168       "scaffolding_described": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "MatPlotAgent's scaffolding is described in detail in Section 4: three modules (query expansion, code agent with self-debugging up to 3 iterations, visual agent for feedback). Figure 3 illustrates the full workflow with data flow between components."
    172       },
    173       "data_preprocessing_documented": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 3.1 documents the full benchmark construction process: source selection from Matplotlib/OriginLab galleries, LLM-based preliminary query generation, data replacement to address memorization, human modification by annotators with ≥3 years experience, and human verification by 3 NLP researchers."
    177       }
    178     },
    179     "limitations_and_scope": {
    180       "limitations_section_present": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 8 is titled 'Limitations' and discusses the domain-specific coverage limitations of MatPlotBench."
    184       },
    185       "threats_to_validity_specific": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "The limitations section mentions only that 'the demands of scientific data visualization can vary significantly across disciplines' and MatPlotBench 'may not encompass all domain-specific requirements.' This is a single generic concern without naming specific domains, specific failure modes, or quantifying the coverage gap."
    189       },
    190       "scope_boundaries_stated": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "The paper does not explicitly state what the results do NOT show. No specific exclusions are listed for what was not tested (e.g., interactive visualizations, non-Python libraries, domain-specific visualization requirements in biology/physics/etc.)."
    194       }
    195     },
    196     "data_integrity": {
    197       "raw_data_available": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "MatPlotBench data (100 queries, raw data files, and ground-truth figures) is released via the GitHub repository, enabling independent verification of the benchmark and reproduction of results."
    201       },
    202       "data_collection_described": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Section 3.1 describes data collection in detail: 75 examples selected from Matplotlib Gallery (1-2 per section), 25 from OriginLab GraphGallery, with specific selection principles (diverse types, representative instances, balanced difficulty)."
    206       },
    207       "recruitment_methods_described": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Appendix B states 'We engage human annotators from computer science departments at various universities via social media.' Annotator qualifications (minimum 3 years coding/NLP experience) are specified in Section 3.1. Compensation is noted as 'slightly higher than the prevailing market rate.'"
    211       },
    212       "data_pipeline_documented": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Section 3.1 documents the full pipeline: selecting original examples → preliminary query generation (GPT-4/GPT-4V) → data replacement (Matplotlib only) → human modification (2 independent annotators) → updating ground-truth figures → human verification (3 NLP researchers) → 100 final examples."
    216       }
    217     },
    218     "conflicts_of_interest": {
    219       "funding_disclosed": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding source is disclosed anywhere in the paper. No acknowledgments section mentioning grants, sponsors, or funding agencies is present."
    223       },
    224       "affiliations_disclosed": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "Author affiliations are clearly listed: Tsinghua University, Beijing Language and Culture University, Xiamen University, Northeastern University (China), and Zhongguancun Laboratory. All are academic institutions, not companies whose products are evaluated."
    228       },
    229       "funder_independent_of_outcome": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No funding is disclosed, so independence of funding cannot be assessed. The paper evaluates products from OpenAI and Google; without funding disclosure, potential financial relationships cannot be verified."
    233       },
    234       "financial_interests_declared": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No competing interests or financial interests statement is present in the paper. Absence of a disclosure statement is noted."
    238       }
    239     },
    240     "contamination": {
    241       "training_cutoff_stated": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No training data cutoff dates are stated for any model (GPT-4, GPT-3.5, or open-source models). This is relevant since MatPlotBench draws from publicly available Matplotlib Gallery examples."
    245       },
    246       "train_test_overlap_discussed": {
    247         "applies": true,
    248         "answer": true,
    249         "justification": "Section 3.1 explicitly discusses memorization: 'we begin data replacement for examples from the Matplotlib Gallery due to the observed phenomenon of memorization by GPT-4.' For OriginLab examples: 'the data is inherently complex, and even GPT-4 does not exhibit memorization with these examples.'"
    250       },
    251       "benchmark_contamination_addressed": {
    252         "applies": true,
    253         "answer": true,
    254         "justification": "Data replacement is performed specifically to mitigate contamination from GPT-4's memorization of Matplotlib Gallery examples (Section 3.1). Original data points are replaced with new ones while keeping plot types unchanged."
    255       }
    256     },
    257     "human_studies": {
    258       "pre_registered": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants as study subjects. Human annotators create and evaluate the benchmark but are not subjects of study."
    262       },
    263       "irb_or_ethics_approval": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants as study subjects. Human annotators serve as experts for benchmark construction and evaluation, not as research subjects."
    267       },
    268       "demographics_reported": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants as study subjects. Annotator qualifications (CS departments, ≥3 years experience) are described but these are evaluators, not subjects."
    272       },
    273       "inclusion_exclusion_criteria": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants as study subjects."
    277       },
    278       "randomization_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants as study subjects. This is a benchmark evaluation, not a human-subjects experiment."
    282       },
    283       "blinding_described": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants as study subjects."
    287       },
    288       "attrition_reported": {
    289         "applies": false,
    290         "answer": false,
    291         "justification": "No human participants as study subjects."
    292       }
    293     },
    294     "cost_and_practicality": {
    295       "inference_cost_reported": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "No API costs, token counts, or wall-clock time are reported. MatPlotAgent makes multiple LLM calls per example (query expansion + code generation + up to 3 self-debugging iterations + visual feedback), but the cost implications are not quantified."
    299       },
    300       "compute_budget_stated": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No compute budget is stated. The paper mentions using vLLM for open-source model inference but does not specify hardware, GPU hours, or total API spend."
    304       }
    305     },
    306     "experimental_rigor": {
    307       "seed_sensitivity_reported": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Results appear to be single-run. Temperature is set to 0.0, suggesting deterministic decoding, but no verification of result stability is provided and API-based models may still exhibit variation."
    311       },
    312       "number_of_runs_stated": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The number of experimental runs is not explicitly stated. Results appear to be from single runs, but this is not confirmed in the text."
    316       },
    317       "hyperparameter_search_budget": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "No hyperparameter search budget is reported. Design choices such as max debugging iterations (3) and correlation experiment parameters (n=25, k=100) appear fixed without justification or search."
    321       },
    322       "best_config_selection_justified": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "MatPlotAgent is presented as a fixed framework. No justification is given for specific design choices like the maximum of 3 self-debugging iterations, and no alternative configurations are explored."
    326       },
    327       "multiple_comparison_correction": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "Seven models are compared across three conditions (Tables 1-2), generating numerous comparisons. No correction for multiple comparisons is applied, and no significance tests are used for the model comparisons at all."
    331       },
    332       "self_comparison_bias_addressed": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The authors evaluate their own MatPlotAgent framework and MatPlotBench benchmark. They implement all baselines (direct decoding, zero-shot CoT) and do not acknowledge the potential bias of evaluating their own system."
    336       },
    337       "compute_budget_vs_performance": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "MatPlotAgent uses substantially more compute than direct decoding (multiple LLM calls including a multi-modal model for visual feedback) but performance is not analyzed as a function of compute cost. No fair-compute-budget comparison is provided."
    341       },
    342       "benchmark_construct_validity": {
    343         "applies": true,
    344         "answer": false,
    345         "justification": "The paper validates the automatic scoring mechanism against human judgment (Section 3.2) but does not discuss whether matching ground-truth figures is a valid measure of scientific data visualization capability, or whether the 100 examples adequately represent the space of visualization tasks."
    346       },
    347       "scaffold_confound_addressed": {
    348         "applies": true,
    349         "answer": true,
    350         "justification": "When comparing models with MatPlotAgent (Table 1, last column), all models use the same scaffold (MatPlotAgent framework + GPT-4V visual agent), controlling for scaffold differences. Table 2 separately tests Gemini Pro Vision as an alternative visual agent."
    351       }
    352     },
    353     "data_leakage": {
    354       "temporal_leakage_addressed": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "Section 3.1 addresses temporal leakage by performing data replacement for Matplotlib Gallery examples after observing GPT-4 memorization. They replace original data points with new ones to mitigate the risk that models trained on public gallery examples would have seen solutions."
    358       },
    359       "feature_leakage_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No discussion of whether the evaluation setup leaks answer information. While the visual agent does not see ground truth when generating feedback, no formal analysis of potential feature leakage in the evaluation pipeline is provided."
    363       },
    364       "non_independence_addressed": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No discussion of whether the 100 benchmark examples are independent. Examples are drawn from the same two galleries (Matplotlib, OriginLab) and could share structural similarities, but this is not addressed."
    368       },
    369       "leakage_detection_method": {
    370         "applies": true,
    371         "answer": true,
    372         "justification": "The authors tested for memorization directly: they observed GPT-4 memorizing Matplotlib examples and applied data replacement as mitigation. For OriginLab examples, they verified 'even GPT-4 does not exhibit memorization with these examples' (Section 3.1)."
    373       }
    374     }
    375   },
    376   "claims": [
    377     {
    378       "claim": "MatPlotAgent improves the performance of various LLMs on scientific data visualization, with improvements of up to +13.21 points on MatPlotBench.",
    379       "evidence": "Table 1 shows improvements for 6 of 7 models tested. GPT-4: +12.30 (48.86→61.16), GPT-3.5: +9.48 (38.03→47.51), Magicoder: +13.21 (38.49→51.70). CodeLlama-34B is the exception at -2.36.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "The visual feedback mechanism is the key driver of MatPlotAgent's improvements.",
    384       "evidence": "Table 4 ablation: GPT-4 drops from 61.16 to 53.44 without visual feedback (7.72 point loss, 63% of total improvement). Table 3 on Qwen-Agent also shows visual feedback contributes the majority of the gain.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "GPT-4V automatic evaluation strongly correlates with human evaluation on MatPlotBench.",
    389       "evidence": "Section 3.2 reports Pearson r=0.876, p=7.41e-33 for GPT-4 and r=0.836, p=2.67e-27 for GPT-3.5, computed over k=100 random subsets of n=25 examples.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "MatPlotAgent is model-agnostic and works with different visual agents.",
    394       "evidence": "Table 2 shows improvements with Gemini Pro Vision as visual agent: GPT-4 +7.87, GPT-3.5 +5.45. However, only two visual agents are tested.",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "Zero-shot chain-of-thought prompting generally hurts performance on scientific data visualization.",
    399       "evidence": "Table 1 shows CoT reduces performance for most models. Of the six models itemized here, five decline (GPT-4: -3.44, GPT-3.5: -0.89, Magicoder: -0.54, DeepSeek-6.7B: -2.37, CodeLlama: -4.14) and only DeepSeek-33B improves (+5.22). The seventh model in Table 1 is not itemized in this entry.",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "Open-source Magicoder-S-DS-6.7B with MatPlotAgent surpasses GPT-4 with direct decoding.",
    404       "evidence": "Table 1: Magicoder + MatPlotAgent scores 51.70 vs GPT-4 direct decoding at 48.86. However, this is a 2.84 point difference on a 100-point scale with no uncertainty quantification.",
    405       "supported": "moderate"
    406     }
    407   ],
    408   "red_flags": [
    409     {
    410       "flag": "No uncertainty quantification on main results",
    411       "detail": "All results in Tables 1-4 are point estimates with no error bars, confidence intervals, or variance measures. With only 100 test examples, the standard error of the mean could be substantial. The 2.84-point difference between Magicoder+MatPlotAgent (51.70) and GPT-4 direct decoding (48.86) may not be statistically significant."
    412     },
    413     {
    414       "flag": "Evaluator-model overlap",
    415       "detail": "GPT-4V serves both as the automatic evaluator that scores outputs and as the visual agent inside MatPlotAgent, the system being evaluated. GPT-4V is also closely related to GPT-4, one of the models being evaluated. This creates a potential bias where outputs generated by GPT-4, or refined using GPT-4V feedback, might be rated more favorably by the GPT-4V evaluator."
    416     },
    417     {
    418       "flag": "No cost analysis despite multi-call framework",
    419       "detail": "MatPlotAgent makes 4+ LLM calls per example (query expansion + code generation + up to 3 debugging rounds + visual feedback with additional code refinement), including expensive multi-modal model calls. Comparing its scores to direct decoding without accounting for the massive compute difference makes the improvements misleading."
    420     },
    421     {
    422       "flag": "Small benchmark size",
    423       "detail": "MatPlotBench contains only 100 examples. Given the diversity of scientific visualization types, this may not provide robust estimates of model performance across the claimed domain. Per-category analysis is not provided, making it impossible to assess where the 100 examples are concentrated."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Evaluating large language models trained on code",
    429       "authors": ["Mark Chen", "Jerry Tworek"],
    430       "year": 2021,
    431       "relevance": "Introduced Codex and the HumanEval benchmark, foundational work for code generation evaluation."
    432     },
    433     {
    434       "title": "GPT-4 technical report",
    435       "authors": ["OpenAI"],
    436       "year": 2023,
    437       "relevance": "Technical report for GPT-4, the primary commercial model evaluated in this paper's experiments."
    438     },
    439     {
    440       "title": "Code Llama: Open foundation models for code",
    441       "authors": ["Baptiste Rozière", "Jonas Gehring"],
    442       "year": 2024,
    443       "relevance": "Open-source code LLM series evaluated as baselines in this paper's experiments."
    444     },
    445     {
    446       "title": "DeepSeek-Coder: When the large language model meets programming",
    447       "authors": ["Daya Guo", "Qihao Zhu"],
    448       "year": 2024,
    449       "relevance": "Open-source code model series evaluated as baselines, demonstrating performance across model sizes."
    450     },
    451     {
    452       "title": "Teaching large language models to self-debug",
    453       "authors": ["Xinyun Chen", "Maxwell Lin"],
    454       "year": 2024,
    455       "relevance": "Self-debugging mechanism adopted in MatPlotAgent's code agent module for iterative error correction."
    456     },
    457     {
    458       "title": "Communicative agents for software development",
    459       "authors": ["Chen Qian", "Xin Cong"],
    460       "year": 2023,
    461       "relevance": "ChatDev LLM agent framework for software development, related agentic approach in the coding domain."
    462     },
    463     {
    464       "title": "AgentBench: Evaluating LLMs as agents",
    465       "authors": ["Xiao Liu", "Hao Yu"],
    466       "year": 2024,
    467       "relevance": "Benchmark for evaluating LLMs as autonomous agents across diverse environments."
    468     },
    469     {
    470       "title": "Magicoder: Source code is all you need",
    471       "authors": ["Yuxiang Wei", "Zhe Wang"],
    472       "year": 2023,
    473       "relevance": "Open-source code LLM achieving strong performance, notably surpassing GPT-4 when augmented with MatPlotAgent."
    474     },
    475     {
    476       "title": "ReAct: Synergizing reasoning and acting in language models",
    477       "authors": ["Shunyu Yao", "Jeffrey Zhao"],
    478       "year": 2023,
    479       "relevance": "Foundational LLM agent framework combining reasoning and action, influential in agentic AI design."
    480     },
    481     {
    482       "title": "ToolLLM: Facilitating large language models to master 16000+ real-world APIs",
    483       "authors": ["Yujia Qin", "Shihao Liang"],
    484       "year": 2024,
    485       "relevance": "Framework for LLM tool use at scale, relevant to agentic AI tool utilization research."
    486     },
    487     {
    488       "title": "Reflexion: Language agents with verbal reinforcement learning",
    489       "authors": ["Noah Shinn", "Federico Cassano"],
    490       "year": 2023,
    491       "relevance": "LLM agent framework using verbal feedback for self-improvement, related to the visual feedback mechanism."
    492     },
    493     {
    494       "title": "DS-1000: A natural and reliable benchmark for data science code generation",
    495       "authors": ["Yuhang Lai", "Chengxi Li"],
    496       "year": 2023,
    497       "relevance": "Prior benchmark for data science code generation including Matplotlib problems, discussed as insufficient for complex visualization."
    498     }
    499   ]
    500 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs