scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24712B)
      1 {
      2   "paper": {
      3     "title": "A Survey on Progress in LLM Alignment from the Perspective of Reward Design",
      4     "authors": [
      5       "Miaomiao Ji",
      6       "Yanqiu Wu",
      7       "Zhibin Wu",
      8       "Shoujin Wang",
      9       "Jian Yang",
     10       "Mark Dras",
     11       "Usman Naseem"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2505.02666",
     16     "doi": "10.48550/arXiv.2505.02666"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["survey_methodology"],
     20   "methodology_tags": ["meta-analysis"],
     21   "key_findings": "This survey taxonomizes reward modeling for LLM alignment along three dimensions: mathematical formulation, construction practices (rule-based, data-driven, hybrid), and functional roles under optimization paradigms (RL-based, ICL-based, DPO/implicit). The paper identifies two major paradigm shifts: from RL-based to RL-free alignment methods, and from single-task to multi-objective/multi-modal reward design. It is a narrative review without systematic methodology or quality assessment of its sources.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No code repository, analysis scripts, or supplementary materials are mentioned anywhere in the paper."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No dataset, paper corpus, or extracted data tables are released. The survey does not provide a downloadable list of reviewed papers or structured extraction results."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No environment specifications are provided. As a survey, analysis tools or scripts could have been specified but were not."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No reproduction instructions are provided. The survey does not describe a reproducible search or analysis protocol that others could follow."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": false,
     48         "answer": false,
     49         "justification": "This is a narrative survey paper that does not run experiments or report quantitative results requiring confidence intervals."
     50       },
     51       "significance_tests": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "No experiments are conducted; no statistical comparisons are made. The paper is a literature review."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "No experiments are conducted; no effect sizes apply to this narrative survey."
     60       },
     61       "sample_size_justified": {
     62         "applies": false,
     63         "answer": false,
     64         "justification": "No experiments or data collection requiring sample size justification. This is a literature survey."
     65       },
     66       "variance_reported": {
     67         "applies": false,
     68         "answer": false,
     69         "justification": "No experimental runs are conducted. The paper is a narrative literature review."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "The paper references prior surveys in the Motivation section (refs [26–30]) but does not systematically compare its coverage, methodology, or contributions against them. There is no structured comparison table showing how this survey differs from or improves upon prior reviews."
     77       },
     78       "baselines_contemporary": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "No experiments requiring baseline comparisons. This is a survey paper."
     82       },
     83       "ablation_study": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No system with components to ablate. This is a survey paper."
     87       },
     88       "multiple_metrics": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No experiments with metrics. This is a survey paper."
     92       },
     93       "human_evaluation": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "No system outputs to evaluate. This is a survey paper."
     97       },
     98       "held_out_test_set": {
     99         "applies": false,
    100         "answer": false,
    101         "justification": "No experiments requiring test sets. This is a survey paper."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper provides detailed per-category breakdowns of reward modeling approaches: Table 1 contrasts rule-based, data-driven, and hybrid RMs across multiple dimensions (construction approach, strengths, limitations, typical examples, best-suited scenarios). Table 2 classifies 34 papers across feedback source, optimization paradigm, and RM characteristics."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper extensively discusses failure modes and limitations of different reward design approaches: reward hacking (Section 1.1), reward misspecification (Section 1.2), mode collapse, annotation bias, PPO instability (Section 5.1.1), and limitations of each RM construction paradigm (Table 1)."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper reports negative findings from the literature: PPO's instability and sample inefficiency (Section 5.1.1), reward model overoptimization (ref [22]), hand-engineered metrics conflicting with each other (Section 4.2), and systematic failures of single-value reward modeling (Section 3.1.4)."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The abstract claims the survey provides 'a structured organization of reward modeling' addressing 'mathematical formulation, construction practices, and interaction with optimization paradigms' with a 'macro-level taxonomy.' Sections 3–5 deliver these three aspects, and Figure 1 provides the taxonomy. The abstract also claims coverage of paradigm shifts from RL-based to RL-free, which Section 6 addresses."
    124       },
    125       "causal_claims_justified": {
    126         "applies": false,
    127         "answer": false,
    128         "justification": "The paper is a narrative survey that makes descriptive/taxonomic claims about the field's evolution rather than testable causal claims from its own empirical analysis."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper makes very broad claims about 'the evolution of LLM alignment' and 'the progression of LLM alignment' without bounding its coverage to specific sub-fields, time periods, or noting what areas of reward design it does not cover. The title and abstract frame the scope broadly without explicit boundaries."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": false,
    137         "answer": false,
    138         "justification": "This is a pure survey/taxonomy paper that presents no empirical results of its own requiring alternative explanations."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": false,
    142         "answer": false,
    143         "justification": "This is a survey paper with no measurements; no proxy-outcome gap can arise from its own methodology."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "No models are used in this survey. It reviews models discussed in the literature."
    151       },
    152       "prompts_provided": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No prompting is used. This is a literature survey."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "No experiments are conducted. This is a literature survey."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. This is a literature survey."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "The paper does not describe any paper selection pipeline: no search queries, no databases searched, no inclusion/exclusion criteria, no screening stages. It is unclear how the ~130 cited papers were identified and selected for inclusion."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "There is no dedicated limitations section. Section 6 ('Discussion') covers future directions but does not discuss limitations of the survey itself. Section 7 ('Conclusions') is a brief paragraph without self-critical reflection."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No threats to validity are discussed. The paper does not address potential biases in its literature selection, coverage gaps, or limitations of its taxonomic framework."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The paper does not explicitly state what it does NOT cover. There are no statements about excluded topics, time boundaries for the literature search, or types of reward design approaches that fall outside the review's scope."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "No raw data is available. The complete list of papers reviewed, search results, or extracted classification data is not provided in any downloadable form."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "The paper does not describe how papers were collected. There is no mention of databases searched, search queries used, date ranges, or how the corpus of reviewed work was assembled."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "For a survey, the 'recruitment' of papers requires describing the search and selection strategy. No such description is provided — it is unclear how the reviewed papers were identified and selected from the broader literature."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No data pipeline is documented. There is no description of stages from initial literature search to final inclusion, no PRISMA-style flow, and no accounting of how many papers were screened vs. included."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding information or acknowledgments section appears in the paper text provided."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are clearly listed: Macquarie University (Australia), Sichuan University (China), and University of Technology Sydney (Australia). None of the authors appear affiliated with companies whose products are being evaluated."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No funding source is disclosed, so independence cannot be assessed. The absence of a funding statement leaves this unknown."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement appears in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "This is a survey paper that does not evaluate any pre-trained model on a benchmark."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "This is a survey paper that does not evaluate any pre-trained model on a benchmark."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "This is a survey paper that does not evaluate any pre-trained model on a benchmark."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this literature survey."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this literature survey."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this literature survey."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this literature survey."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this literature survey."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this literature survey."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this literature survey."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "This is a survey paper with no method of its own to cost."
    293       },
    294       "compute_budget_stated": {
    295         "applies": false,
    296         "answer": false,
    297         "justification": "This is a survey paper with no computational experiments."
    298       }
    299     },
    300     "survey_methodology": {
    301       "prisma_or_structured_protocol": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The paper follows no PRISMA or structured review protocol. There is no flow diagram, no registered protocol, no reproducible search strategy, and no systematic inclusion/exclusion criteria. It is a narrative review with ad-hoc paper collection."
    305       },
    306       "quality_assessment_of_sources": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The survey does not assess the methodological quality of its source papers. All cited works are treated equally regardless of their rigor — no quality scoring rubric, risk-of-bias assessment, or evaluation of study design strength is applied."
    310       },
    311       "publication_bias_discussed": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No discussion of publication bias. The survey does not consider whether its sources skew toward positive results, whether negative findings about reward modeling are underrepresented, or whether the reviewed literature has selection effects."
    315       }
    316     }
    317   },
    318   "claims": [
    319     {
    320       "claim": "Reward design is the central connective tissue in the LLM alignment pipeline, bridging feedback collection and optimization.",
    321       "evidence": "Argued throughout Section 1.2 and Section 2 via a conceptual framework (Figure 2) positioning reward design as the 'prescription' stage between feedback ('diagnosis') and optimization ('treatment').",
    322       "supported": "moderate"
    323     },
    324     {
    325       "claim": "The evolution of LLM alignment can be understood as a continuous refinement of reward design strategies.",
    326       "evidence": "Supported narratively through the progression described in Sections 3-6, covering the shift from rule-based to data-driven to hybrid reward modeling, and from explicit to implicit RMs. Table 2 in Section 6 maps 34 papers across these dimensions.",
    327       "supported": "moderate"
    328     },
    329     {
    330       "claim": "There has been a paradigm shift from RL-based to RL-free optimization methods in LLM alignment.",
    331       "evidence": "Section 5.3 describes DPO and variants as RL-free alternatives. Section 6.1 asserts this trend. Supported by citations to DPO (ref [70]), SFT-based methods (refs [99-101]), and various DPO variants.",
    332       "supported": "moderate"
    333     },
    334     {
    335       "claim": "Hybrid reward models that integrate multiple signal sources offer superior robustness and generalization in complex tasks.",
    336       "evidence": "Section 4.2 reviews hybrid approaches across multi-source, multi-granularity, multi-modal, and multi-aspect dimensions. Individual cited papers report improvements, but the survey does not independently verify or meta-analyze these claims.",
    337       "supported": "weak"
    338     },
    339     {
    340       "claim": "Token-level reward modeling provides finer credit assignment and more stable training compared to response-level approaches.",
    341       "evidence": "Section 3.1.3 reviews token-level methods (TLCR ref [34], TDPO ref [36], TLDR ref [37], Q-RM ref [38]) and asserts these benefits, citing individual papers' results without independent verification or meta-analysis.",
    342       "supported": "weak"
    343     }
    344   ],
    345   "red_flags": [
    346     {
    347       "flag": "No systematic review methodology",
    348       "detail": "The survey follows no PRISMA or structured protocol. There is no documented search strategy, no stated databases, no inclusion/exclusion criteria, and no PRISMA flow diagram. The paper collection appears entirely ad-hoc, making it impossible to assess completeness or selection bias."
    349     },
    350     {
    351       "flag": "No quality assessment of reviewed studies",
    352       "detail": "All cited papers are treated as equally valid regardless of their methodological rigor. A paper reporting PPO improvements with careful ablation studies is treated identically to a preprint with only a single-run comparison. This masks differences in evidence quality among the sources, giving weakly supported results the same weight as rigorous ones."
    353     },
    354     {
    355       "flag": "No limitations section",
    356       "detail": "The survey contains no self-critical discussion of its own methodology, coverage gaps, or potential biases. Section 6 discusses future directions for the field but not limitations of the survey itself."
    357     },
    358     {
    359       "flag": "Broad claims without bounded scope",
    360       "detail": "The paper claims to capture 'the evolution of LLM alignment' and 'paradigm shifts' without specifying what time period, sub-fields, or venues were covered. No boundaries on what falls outside the review's scope are stated."
    361     },
    362     {
    363       "flag": "Uncritical aggregation of results",
    364       "detail": "Claims like 'hybrid reward models offer superior robustness' and 'token-level rewards provide more stable training' are stated as general conclusions drawn from individual papers' self-reported improvements, without independent verification or consideration of contradictory evidence."
    365     }
    366   ],
    367   "cited_papers": [
    368     {
    369       "title": "Training language models to follow instructions with human feedback",
    370       "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"],
    371       "year": 2022,
    372       "arxiv_id": "2203.02155",
    373       "relevance": "InstructGPT — foundational RLHF paper demonstrating instruction-following alignment with human feedback, central to understanding reward modeling pipelines."
    374     },
    375     {
    376       "title": "Direct preference optimization: Your language model is secretly a reward model",
    377       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"],
    378       "year": 2023,
    379       "arxiv_id": "2305.18290",
    380       "relevance": "DPO — key RL-free alignment method that implicitly encodes reward through preference optimization, representing a paradigm shift in reward design."
    381     },
    382     {
    383       "title": "Deep reinforcement learning from human preferences",
    384       "authors": ["Paul F. Christiano", "Jan Leike", "Tom Brown"],
    385       "year": 2017,
    386       "relevance": "Foundational RLHF work introducing reward modeling from human preference comparisons, establishing the framework for LLM alignment research."
    387     },
    388     {
    389       "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback",
    390       "authors": ["Yuntao Bai", "Andy Jones", "Kamal Ndousse"],
    391       "year": 2022,
    392       "arxiv_id": "2204.05862",
    393       "relevance": "Anthropic's RLHF methodology for helpful and harmless AI assistants, demonstrating multi-objective reward design in practice."
    394     },
    395     {
    396       "title": "Constitutional AI: Harmlessness from AI feedback",
    397       "authors": ["Yuntao Bai", "Saurav Kadavath", "Sandipan Kundu"],
    398       "year": 2022,
    399       "arxiv_id": "2212.08073",
    400       "relevance": "Introduces rule-based constitutional principles combined with AI feedback for alignment, a key hybrid reward design approach."
    401     },
    402     {
    403       "title": "Scaling laws for reward model overoptimization",
    404       "authors": ["Leo Gao", "John Schulman", "Jacob Hilton"],
    405       "year": 2023,
    406       "relevance": "Empirically characterizes reward hacking and overoptimization in RLHF, directly relevant to understanding reward model limitations."
    407     },
    408     {
    409       "title": "Rule based rewards for language model safety",
    410       "authors": ["Tong Mu", "Alec Helyar", "Johannes Heidecke"],
    411       "year": 2024,
    412       "arxiv_id": "2411.01111",
    413       "relevance": "Proposes rule-based reward framework combining behavioral rules with LLM verification for safety alignment without preference data."
    414     },
    415     {
    416       "title": "DeepSeekMath: Pushing the limits of mathematical reasoning in open language models",
    417       "authors": ["Zhihong Shao", "Peiyi Wang", "Qihao Zhu"],
    418       "year": 2024,
    419       "arxiv_id": "2402.03300",
    420       "relevance": "Introduces Group Relative Policy Optimization (GRPO), a critic-free RL alternative replacing absolute rewards with group-based relative feedback."
    421     },
    422     {
    423       "title": "Let's verify step by step",
    424       "authors": ["Hunter Lightman", "Vineet Kosaraju", "Yura Burda"],
    425       "year": 2023,
    426       "arxiv_id": "2305.20050",
    427       "relevance": "Introduces stepwise process rewards for reasoning, demonstrating fine-grained reward design for chain-of-thought alignment."
    428     },
    429     {
    430       "title": "Self-refine: Iterative refinement with self-feedback",
    431       "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"],
    432       "year": 2023,
    433       "arxiv_id": "2303.17651",
    434       "relevance": "Demonstrates non-numerical reward via stepwise self-feedback during generation, an alternative to scalar reward models."
    435     },
    436     {
    437       "title": "Defining and characterizing reward hacking",
    438       "authors": ["Joar Skalse", "Nikolaus H.R. Howe", "Dmitrii Krasheninnikov"],
    439       "year": 2022,
    440       "arxiv_id": "2209.13085",
    441       "relevance": "Formalizes reward hacking — a core failure mode in reward-based alignment that motivates much of the hybrid reward design research."
    442     },
    443     {
    444       "title": "Safe RLHF: Safe reinforcement learning from human feedback",
    445       "authors": ["Josef Dai", "Xuehai Pan", "Ruiyang Sun"],
    446       "year": 2023,
    447       "arxiv_id": "2310.12773",
    448       "relevance": "Addresses safety-aware RLHF with constrained reward optimization, relevant to multi-objective reward design for AI safety."
    449     }
    450   ],
    451   "engagement_factors": {
    452     "practical_relevance": {
    453       "score": 1,
    454       "justification": "Survey provides conceptual taxonomy useful for researchers but no directly usable tool or technique for practitioners."
    455     },
    456     "surprise_contrarian": {
    457       "score": 0,
    458       "justification": "Confirms well-known narratives about the RL-to-RL-free shift and the importance of reward design without challenging any conventional wisdom."
    459     },
    460     "fear_safety": {
    461       "score": 1,
    462       "justification": "Discusses alignment failures, reward hacking, and safety concerns but as a review of known issues rather than revealing new risks."
    463     },
    464     "drama_conflict": {
    465       "score": 0,
    466       "justification": "No controversy, no criticism of specific labs or methods, purely descriptive taxonomy."
    467     },
    468     "demo_ability": {
    469       "score": 0,
    470       "justification": "Pure survey paper with no code, demo, or tool to try."
    471     },
    472     "brand_recognition": {
    473       "score": 0,
    474       "justification": "From Macquarie University and Sichuan University — not major AI labs or recognizable brands."
    475     }
    476   }
    477 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs