scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26847B)
      1 {
      2   "paper": {
      3     "title": "Code Aesthetics with Agentic Reward Feedback",
      4     "authors": [
      5       "Bang Xiao",
      6       "Lingjie Jiang",
      7       "Shaohan Huang",
      8       "Tengchao Lv",
      9       "Yupan Huang",
     10       "Xun Wu",
     11       "Lei Cui",
     12       "Furu Wei"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv preprint",
     16     "arxiv_id": "2510.23272"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "A project page URL is provided: https://bangx7.github.io/code-aesthetics (line 60). This is a project page link, not a direct code repository URL. However, given that the project page is provided and such pages typically host code/data links, this counts as YES."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper describes the AesCode-358K dataset and OpenDesign benchmark but provides only a project page URL. No direct download link or repository for the dataset is provided in the paper text itself. The dataset release status is not explicitly confirmed."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions using LLaMA-Factory, VeRL, playwright, selenium, and specific GPU hardware (8xMI300), but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided. Training settings are given in Appendix E (learning rates, batch sizes, epochs), but there are no README-style instructions or runnable scripts described that would allow someone to replicate the full pipeline."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Tables 1 and 2 are reported as point estimates without confidence intervals or error bars. No uncertainty quantification is provided for any metric."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper reports Spearman's rank correlation (0.98, p < 1.5e-6) and Kendall's rank correlation (0.91, p < 3.0e-5) for the OpenDesign vs Design Arena comparison in Section 6.2. However, no significance tests are reported for the main performance comparisons in Table 1."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper reports absolute scores with baselines for comparison. For example, AesCoder-4B achieves 81.92 total static score vs. Qwen3-4B-Instruct-2507 base at 73.26 (Table 1), and improvement from SFT to GRPO-AR is shown in Table 2 (e.g., 28.50 to 30.42 Align score). Baseline context is sufficient to assess effect magnitude."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No justification is given for the choice of 840 benchmark cases in OpenDesign, the 200 sampled HTML pairs for human evaluation, or the 100 test cases for the human preference study. No power analysis is discussed."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance, standard deviation, or spread measures are reported for any experimental results. Tables 1 and 2 report only single-run point estimates. No mention of multiple runs or seeds."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Table 1 includes comparisons against multiple proprietary models (GPT-4o-mini, GPT-4o, GPT-4.1, GPT-5, Claude Sonnet 4) and open-source models (Qwen3-Coder-30B, GLM-4-32B, DeepSeek-V3.1, DeepSeek-R1-0528, etc.) as well as the base models before fine-tuning."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Baselines include very recent models: GPT-5 (2025), Claude Sonnet 4 (2025), DeepSeek-R1-0528 (2025), Qwen3-Coder-480B (2025). These are clearly contemporary and competitive."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Table 2 presents ablations: GRPO-AR with vs. without agentic reward, and comparisons with DPO and RFT alternatives. The 'GRPO-AR w/o Agentic Reward' variant isolates the contribution of the multi-agent reward system."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Multiple metrics are used: for PandasPlotBench, error rate, average score, and good rate; for OpenDesign, alignment score, aesthetics score, structural score, total static score, and interactive aesthetics score."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Human evaluation is conducted in two forms: (1) 10 human evaluators (3 professors, 7 graduate students) annotated 200 HTML pairs for the OpenDesign reliability analysis (Section 6.2), and (2) human preference annotations comparing AesCoder against baselines on 100 test cases (Appendix F)."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The OpenDesign benchmark consists of 840 cases separate from training data. The RL training data comes from WebSight v0.2 (20K samples), explicitly chosen to avoid overlap with AesCode-358K used in Stage I (Section 5.2). PandasPlotBench is also an external benchmark."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "OpenDesign results are broken down by sub-scores (alignment, aesthetics, structural, interactive). Table 4 in Appendix C shows the category distribution of the benchmark. PandasPlotBench results are also broken into error rate, average score, and good rate."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 4.3 (Discussions) explicitly discusses failure cases of the interactive aesthetics agent, including confusing webpage elements and being misled by irrelevant textual content. The case study in Figure 4 also shows comparative outputs where models fail."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Table 2 shows that the GRPO-AR w/o Agentic Reward ablation sometimes performs worse than SFT alone (e.g., for Qwen2.5-Coder-7B-Instruct, Align drops from 28.85 to 28.81). The paper also acknowledges that GUI agent failures can lead to scores lower than true values (Section 4.3)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims that AesCoder-4B surpasses GPT-4o and GPT-4.1 and achieves performance comparable to 480B-685B parameter models. Table 1 confirms: AesCoder-4B (81.92 total) vs GPT-4o (48.08), GPT-4.1 (65.79), and Qwen3-Coder-480B (79.90). All abstract claims are supported by the results."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper makes causal claims about the agentic reward improving performance. The ablation in Table 2 (GRPO-AR with vs. without agentic reward) provides controlled single-variable manipulation, which is adequate for causal inference within the experimental context."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper's title is 'Code Aesthetics with Agentic Reward Feedback' but experiments are limited to two specific domains: Python matplotlib/seaborn/plotly plots and HTML webpage design. The paper does not explicitly bound its claims to these domains, and phrases like 'code aesthetics' in the title suggest broader applicability than demonstrated."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper does not discuss alternative explanations for the results. For example, it does not consider whether improvements could be due to the additional training data rather than the agentic reward specifically, or whether GPT-5 as a judge systematically favors outputs trained with GPT-5-generated data. No threats-to-validity or alternative explanations section is present."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Specific model versions are provided: Qwen3-4B-Instruct-2507, Qwen2.5-Coder-7B-Instruct, Qwen3-Coder-480B-A35B-Instruct-FP8, GPT-4o, GPT-5 (minimal). While some models lack exact API snapshot dates (e.g., GPT-4o, GPT-5), the specific model identifiers used (e.g., Qwen3-4B-Instruct-2507 with date suffix) show versioning effort."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Full prompt templates are provided in Appendices H.1-H.7 for the pairwise evaluation, pointwise evaluation, interactive aesthetics agent, ablation variant, keyword generation, RL data rewriting, and execution agent validation rules."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Appendix E provides detailed hyperparameters: SFT uses AdamW with 1e-5 max LR, batch size 128, 3 epochs, 10% warmup; RL uses 3e-6 LR, batch size 64, KL coefficient 0.001, epsilon 0.5, group size G=8. Reward weights are wexec=0.1, wstatic=0.8, winteract=0.1."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The three-agent agentic reward framework is described in detail in Section 4: execution agent (HTML validation), static aesthetics agent (screenshot-based GPT-5 evaluation), and interactive aesthetics agent (WebVoyager-based GUI interaction). Agent planning, interacting, and scoring mechanisms are all described."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3 documents the data construction pipeline: for plots, code regeneration from VisCode-200K with quality control via library restriction and runtime validation (200K→158K). For webpages, a four-step process (keyword generation → instruction generation → embedding/clustering dedup → code generation with quality filtering) is documented with counts at each stage."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The closest is a brief 'Discussions' paragraph in Section 4.3 about GUI agent failure modes, but this is not a proper limitations section."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No specific threats to validity are discussed. The paper does not address concerns such as: the circular nature of using GPT-5 both to generate training data and as the evaluation judge, the limited domain coverage (only plots and webpages), or potential overfitting to the evaluation metric."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the results do NOT show. No statements about what domains, tasks, or settings are excluded from the claims. The 'code aesthetics' framing in the title implies broader scope than the experiments support."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "Neither the raw training data (AesCode-358K) nor the OpenDesign benchmark cases are made available for independent verification in the paper. Only a project page URL is provided without confirmed data downloads."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 3 describes the data collection in detail: plot data adapted from VisCode-200K with Qwen3-Coder regeneration; webpage data created via GPT-4o keyword generation, instruction generation, embedding-based dedup, and GPT-5/Qwen3-Coder HTML generation with quality filtering."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "For the human evaluation (10 humans: 3 professors, 7 graduate students), no description is given of how they were recruited, what institution they are from (beyond affiliations), whether they had relevant expertise, or whether this introduces selection bias."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The data pipeline is documented with counts at key stages: VisCode-200K → 158K after quality filtering for plots; 400K instructions → 200K after K-Means dedup for webpages; 20K RL data from WebSight v0.2 with GPT-4o rewriting for the RL stage."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding source or acknowledgments section is present in the paper. All authors are affiliated with Microsoft Research Asia and partner universities, but no funding disclosure is provided."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: Microsoft Research Asia, Zhiyuan College at Shanghai Jiao Tong University, and Peking University."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "All first and senior authors are from Microsoft Research Asia. While the paper does not directly evaluate a Microsoft product, the authors' employer is a major AI company with competing products. No discussion of whether the funder (Microsoft) has a stake in the outcome."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement or financial interest declaration is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper evaluates multiple LLMs on the OpenDesign and PandasPlotBench benchmarks but does not state the training data cutoff for any of the models evaluated (GPT-4o, GPT-5, Claude Sonnet 4, etc.)."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether the benchmark data (OpenDesign cases, PandasPlotBench) could overlap with the training data of the evaluated models. The RL data is explicitly separated from SFT data, but broader contamination from pre-training is not addressed."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "OpenDesign is a new benchmark, which partially mitigates contamination. However, PandasPlotBench is an existing benchmark that could be in the training data of the proprietary models. This is not discussed."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "The paper includes human evaluation studies (Section 6.2 and Appendix F) but no pre-registration is mentioned."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No IRB or ethics board approval is mentioned for the human evaluation studies involving 10 human evaluators."
    252       },
    253       "demographics_reported": {
    254         "applies": true,
    255         "answer": false,
    256         "justification": "The human evaluators are described only as '3 professors, 7 graduate students.' No further demographics (field of expertise, experience level, gender, geographic distribution) are reported."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "No inclusion or exclusion criteria are stated for the selection of the 10 human evaluators."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "The human evaluation is pairwise comparison rating, not a randomized experimental study with treatment/control conditions. Randomization of assignment to conditions is not applicable."
    267       },
    268       "blinding_described": {
    269         "applies": true,
    270         "answer": false,
    271         "justification": "No description of whether human evaluators knew which model generated which output. Blinding would be relevant for the pairwise comparison study to prevent bias toward known models."
    272       },
    273       "attrition_reported": {
    274         "applies": true,
    275         "answer": false,
    276         "justification": "No information about evaluator dropout or whether all 10 evaluators completed all 200 comparisons. The paper states 2,000 annotations from 200 pairs, which is consistent with full completion (10 evaluators x 200 pairs = 2,000), but this is not explicitly confirmed."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference cost or latency is reported for the AesCoder models or the agentic reward framework. The framework involves multiple API calls (GPT-5 for static scoring, GPT-4o for interactive scoring) and browser automation, but costs are not quantified."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Appendix E states: SFT phase takes approximately 2 days on 1 node of 8xMI300 GPUs for the 7B model. RL phase takes approximately 7 days on 1 node of 8xMI300 GPUs. This provides a reasonable picture of compute requirements."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "AesCoder-4B surpasses GPT-4o and GPT-4.1 on both PandasPlotBench and OpenDesign benchmarks.",
    295       "evidence": "Table 1: AesCoder-4B achieves 81.92 total static score and 1.04 interactive score on OpenDesign vs. GPT-4o (48.08/0.44) and GPT-4.1 (65.79/0.74). On PandasPlotBench, AesCoder-4B achieves 0.09 error rate and 70 avg score vs. GPT-4o (0.09/68) and GPT-4.1 (0.09/69).",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "AesCoder-4B achieves performance comparable to large open-source models with 480B-685B parameters.",
    300       "evidence": "Table 1: AesCoder-4B (4B params) achieves 81.92 total static score vs. Qwen3-Coder-480B (79.90) and DeepSeek-V3.1/685B (77.72). Interactive score: AesCoder-4B (1.04) vs. Qwen3-Coder-480B (0.70) and DeepSeek-V3.1 (0.88).",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "OpenDesign closely reflects large-scale human judgment, achieving Spearman's correlation of 0.98 with Design Arena.",
    305       "evidence": "Section 6.2: Spearman = 0.98 (p < 1.5e-6) and Kendall = 0.91 (p < 3.0e-5) between OpenDesign and Design Arena rankings across 10 models. 66.7% top-3 and 80.0% top-5 overlap with Design Arena.",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "Agentic reward feedback is essential for achieving the improvements; removing it degrades performance.",
    310       "evidence": "Table 2: For Qwen3-4B, GRPO-AR with agentic reward achieves 30.42/26.19/25.31/1.04 vs. without agentic reward 29.16/25.20/24.67/0.71 across Align/Aes/Struct/InterAes. Similar pattern for Qwen2.5-Coder-7B.",
    311       "supported": "strong"
    312     },
    313     {
    314       "claim": "LLM-as-a-Judge (GPT) achieves higher agreement with human evaluators (80.9%) than humans agree with each other (68.7%).",
    315       "evidence": "Section 6.2 and Figure 3b: GPT-human agreement = 80.9%, human-human agreement = 68.7% across 2,000 annotations from 200 HTML pairs.",
    316       "supported": "moderate"
    317     }
    318   ],
    319   "methodology_tags": [
    320     "benchmark-eval"
    321   ],
    322   "key_findings": "The paper introduces a multi-agent reward framework (GRPO-AR) for training LLMs on 'code aesthetics' tasks (plot generation and webpage design). The framework uses three agents to evaluate code executability, static visual aesthetics (via screenshots), and interactive aesthetics (via GUI agent). AesCoder-4B, trained with SFT on AesCode-358K and RL with GRPO-AR, achieves performance on OpenDesign and PandasPlotBench that matches or exceeds models more than 100x larger (480B-685B parameters) and outperforms GPT-4o and GPT-4.1. The OpenDesign benchmark of 840 webpage cases shows strong correlation (Spearman=0.98) with human judgments from Design Arena.",
    323   "red_flags": [
    324     {
    325       "flag": "Circular evaluation design",
    326       "detail": "GPT-5 is used both to generate part of the training data (Section 3.2: GPT-5 generates HTML code) and as the judge in the static aesthetics evaluation (Section 4.2: GPT-5 is the judge). This creates a circularity where the evaluation metric may systematically favor outputs that resemble GPT-5's own generation style."
    327     },
    328     {
    329       "flag": "No variance or error bars",
    330       "detail": "All experimental results are reported as single point estimates without any variance, standard deviation, confidence intervals, or multiple-run statistics. It is impossible to assess the stability or reliability of the reported numbers."
    331     },
    332     {
    333       "flag": "No limitations section",
    334       "detail": "The paper has no dedicated limitations or threats-to-validity section. Key concerns — circular evaluation, limited domain scope (only plots and webpages), potential data contamination in proprietary model comparisons, and the reliability of GUI agents as evaluators — are not systematically discussed."
    335     },
    336     {
    337       "flag": "Microsoft employees evaluating with Microsoft-relevant models",
    338       "detail": "All primary authors are from Microsoft Research Asia. While the paper does not evaluate a Microsoft product directly, the competitive landscape (their model outperforming GPT-4o, GPT-4.1, etc.) has commercial implications. No conflict of interest statement is provided."
    339     },
    340     {
    341       "flag": "Judge model used for both training data quality filtering and evaluation",
    342       "detail": "GPT-5 scores webpage screenshots during training data construction (Section 3.2: 'asked GPT-5 to score the two outputs') and the same model serves as the OpenDesign static aesthetics judge. The training data was optimized to score well with GPT-5, potentially inflating benchmark scores."
    343     }
    344   ],
    345   "cited_papers": [
    346     {
    347       "title": "Drawing Pandas: A Benchmark for LLMs in Generating Plotting Code",
    348       "authors": ["Timur Galimzyanov", "Sergey Titov", "Yaroslav Golubev", "Egor Bogomolov"],
    349       "year": 2025,
    350       "relevance": "Benchmark for evaluating LLM code generation for data visualization, directly used as evaluation in this paper."
    351     },
    352     {
    353       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    354       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    355       "year": 2023,
    356       "relevance": "Foundational work on LLM-as-a-judge evaluation methodology used throughout this paper's evaluation design."
    357     },
    358     {
    359       "title": "RLEF: Grounding Code LLMs in Execution Feedback with Reinforcement Learning",
    360       "authors": ["Jonas Gehring", "Kunhao Zheng", "Jade Copet"],
    361       "year": 2024,
    362       "relevance": "Prior work on execution-based reward signals for code LLM training, a baseline approach this paper extends."
    363     },
    364     {
    365       "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",
    366       "authors": ["Zhihong Shao", "Peiyi Wang", "Qihao Zhu"],
    367       "year": 2024,
    368       "relevance": "Introduces GRPO algorithm that forms the basis of the GRPO-AR method proposed in this paper."
    369     },
    370     {
    371       "title": "WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models",
    372       "authors": ["Hongliang He", "Wenlin Yao", "Kaixin Ma"],
    373       "year": 2024,
    374       "relevance": "Web agent framework adopted for the interactive aesthetics evaluation agent in this paper."
    375     },
    376     {
    377       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    378       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"],
    379       "year": 2024,
    380       "relevance": "Preference-optimization method (DPO) compared against GRPO-AR as an alternative training approach in the ablation study."
    381     },
    382     {
    383       "title": "Agentic Reward Modeling: Integrating Human Preferences with Verifiable Correctness Signals for Reliable Reward Systems",
    384       "authors": ["Hao Peng", "Yunjia Qi", "Xiaozhi Wang"],
    385       "year": 2025,
    386       "relevance": "Related work on combining agent-based verification with human preference reward signals for LLM training."
    387     },
    388     {
    389       "title": "Why Do Multi-Agent LLM Systems Fail?",
    390       "authors": ["Mert Cemri", "Melissa Z. Pan", "Shuyi Yang"],
    391       "year": 2025,
    392       "relevance": "Study of multi-agent LLM failure modes, relevant to understanding limitations of the agentic reward framework."
    393     },
    394     {
    395       "title": "SFT Memorizes, RL Generalizes: A Comparative Study of Foundation Model Post-Training",
    396       "authors": ["Tianzhe Chu", "Yuexiang Zhai", "Jihan Yang"],
    397       "year": 2025,
    398       "relevance": "Provides empirical motivation for the paper's two-stage SFT-then-RL training approach."
    399     },
    400     {
    401       "title": "A Survey on LLM-as-a-Judge",
    402       "authors": ["Jiawei Gu", "Xuhui Jiang", "Zhichao Shi"],
    403       "year": 2025,
    404       "relevance": "Survey of LLM-as-a-judge methodology directly relevant to the evaluation approach used in this paper."
    405     },
    406     {
    407       "title": "VisCoder: Fine-Tuning LLMs for Executable Python Visualization Code Generation",
    408       "authors": ["Yuansheng Ni", "Ping Nie", "Kai Zou"],
    409       "year": 2025,
    410       "relevance": "Source of the VisCode-200K dataset that the paper adapts for its plot generation training data."
    411     },
    412     {
    413       "title": "Training Language Models to Follow Instructions with Human Feedback",
    414       "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"],
    415       "year": 2022,
    416       "relevance": "Foundational RLHF work that this paper's reward framework builds upon."
    417     }
    418   ]
    419 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs