scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (20129B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "On Evaluating LLM Alignment by Evaluating LLMs as Judges",
      6     "authors": [
      7       "Yixin Liu",
      8       "Pengfei Liu",
      9       "Arman Cohan"
     10     ],
     11     "year": 2025,
     12     "venue": "NeurIPS 2025",
     13     "arxiv_id": "2511.20604",
     14     "doi": "10.48550/arXiv.2511.20604"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The abstract claims ALIGNEVAL 'matches or surpasses' AlpacaEval and Arena-Hard. Table 4 supports this: ALIGNEVAL-CLAUDE (0.885) > Arena-Hard-SC (0.882) and >> AlpacaEval-LC (0.746). The 0.94 combined correlation with IFEval is supported by Table 4 (0.946).",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper is careful to use correlational language ('strong correlation', 'consistency', 'suggests'). The main claim is about correlation between generation and evaluation rankings, and the study design (Spearman's rank correlation) is appropriate for this correlational claim. Ablation of filtering (Table 1) uses controlled manipulation.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Claims are generally bounded to the tested settings. The paper specifies results on specific instruction sets (Arena-Hard, AlpacaEval, WildBench), specific oracles (GPT-4o, 15 other LLMs), and specific model sets. §5 frames ALIGNEVAL as a 'proxy evaluation by design.'",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "§3.2.2 discusses why AlpacaEval shows lower GE-consistency (more open-ended instructions). §4.3 discusses self-preference bias. §5 considers adversarial gaming. §4.2 acknowledges ChatBot Arena is 'not a true gold standard' with potential biases, citing Singh et al.",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "§5 explicitly states ALIGNEVAL is 'a proxy evaluation by design' and discusses the gap: evaluating LLMs as judges is a proxy for their alignment capability. The paper acknowledges this could be gamed by fine-tuning a model to be a good judge without improving generation.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "§5 'Discussion and Conclusion' contains substantive limitations discussion including adversarial vulnerability, proxy evaluation concerns, and the recommendation to combine with IFEval to mitigate risks.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "§5 discusses specific threats: 'fine-tuning an LLM to act as a judge could artificially boost its ALIGNEVAL ranking without meaningfully improving its alignment.' §4.3 identifies specific self-preference bias (ALIGNEVAL-GPT favors GPT, ALIGNEVAL-CLAUDE favors Claude). §4.2 notes ChatBot Arena's 'opaque data collection process and potential biases.'",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The paper does not explicitly list what settings, populations, or model types the results do NOT generalize to. §5 notes it 'may be vulnerable to adversarial attacks' but does not state specific untested conditions or excluded scenarios.",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Acknowledgements state: 'We are grateful for the TPU compute support provided by the Google TRC program and for the OpenAI API credits support provided by OpenAI's Researcher Access Program.'",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are clearly listed: Yale University and Shanghai Jiao Tong University. These are academic institutions without direct product ties to the evaluated models.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "OpenAI provided API credits and GPT-4o serves as the primary preference oracle (its evaluations define the benchmark's gold standard). Google provided TPU compute and Gemini models rank highest in the evaluation. Both funders have financial interest in their models performing well.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interest declaration is present in the paper.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "\"LLM alignment\" is explicitly defined in footnote 2 as 'general capabilities in following human instructions and providing helpful, high-quality responses'; GE-consistency is formally defined in Eq. 1 with notation.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper states a two-fold contribution explicitly: (1) first comprehensive analysis of GE-consistency across multiple LLMs, and (2) the ALIGNEVAL benchmark for assessing alignment without LLM judges.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "§2 explicitly distinguishes GE-consistency from the related but distinct GV-consistency (Li et al., West et al., Rodriguez et al.) and positions ALIGNEVAL against AlpacaEval, Arena-Hard, MixEval, IFEval, and HelpSteer3 with direct comparative experiments.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "benchmark-creation": {
    118       "construct_design": {
    119         "construct_validity_argued": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "The paper explicitly argues construct validity: because GE-consistency is empirically high (ρ=0.971), measuring evaluation performance serves as a valid proxy for generation alignment — the argument is formal (Eq. 1) and empirically validated across three instruction sets.",
    123           "source": "haiku"
    124         },
    125         "difficulty_distribution_characterized": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "The paper notes Arena-Hard contains more challenging technical instructions versus AlpacaEval's open-ended ones, but no explicit difficulty tiers or distribution analysis of benchmark items is provided; filtering removes ~50% of instances without characterizing the remaining distribution.",
    129           "source": "haiku"
    130         },
    131         "ceiling_floor_effects_checked": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Table 3 shows scores range broadly (e.g., llama-3-8b ALIGNEVAL-GPT 6.4% vs gemini-2.0-flash 80.8%), suggesting discrimination, but the paper does not explicitly check for ceiling/floor effects or discuss what score thresholds would indicate benchmark failure.",
    135           "source": "haiku"
    136         },
    137         "human_baseline_included": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "ChatBot Arena provides human-derived LLM rankings as a gold standard for correlation, but no human baseline on the ALIGNEVAL task itself (i.e., how well humans perform at predicting preference oracle judgments) is included.",
    141           "source": "haiku"
    142         },
    143         "scoring_rubric_justified": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "Cohen's Kappa is chosen over accuracy with explicit justification that it 'can better reflect model performance when the label distribution is unbalanced'; pairwise comparison with order-swapping is justified by citing prior work on reliability.",
    147           "source": "haiku"
    148         }
    149       },
    150       "robustness": {
    151         "contamination_resistance_designed": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "ALIGNEVAL is built from public Arena-Hard instructions and GPT-4o annotations; no contamination resistance mechanisms (temporal splits, canary strings, dynamic generation) are designed in, and the paper does not discuss whether evaluated models may have been trained on Arena-Hard data.",
    155           "source": "haiku"
    156         },
    157         "temporal_robustness_discussed": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "§4.3 notes that 'all alignment benchmarks show lower correlations with ChatBot Arena than reported at release' due to stronger LLMs, suggesting benchmark degradation, but no update plan or temporal robustness strategy for ALIGNEVAL is proposed.",
    161           "source": "haiku"
    162         },
    163         "failure_modes_discussed": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "§5 explicitly discusses adversarial fine-tuning as a failure mode (models fine-tuned to be good judges without genuine alignment improvement) and self-preference bias as a structural limitation; these are specific, actionable failure modes.",
    167           "source": "haiku"
    168         },
    169         "baseline_implementations_provided": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "The paper provides a GitHub repository (https://github.com/yale-nlp/AlignEval) and the NeurIPS checklist states dataset and codebase will be included in supplemental material with reproduction instructions.",
    173           "source": "haiku"
    174         }
    175       },
    176       "documentation": {
    177         "dataset_documentation_complete": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "§4.1 describes construction methodology (Arena-Hard instances, GPT-4o/Claude-3.7-Sonnet annotations, consistency filtering to 2671 instances), but no formal data card, full preprocessing specification, or inter-annotator statistics for the oracle annotations are provided.",
    181           "source": "haiku"
    182         },
    183         "licensing_and_access_clear": {
    184           "applies": true,
    185           "answer": false,
    186           "justification": "A GitHub repository is provided but no license for ALIGNEVAL itself is stated in the paper; the terms under which the benchmark can be used, modified, or redistributed are not specified.",
    187           "source": "haiku"
    188         },
    189         "intended_use_specified": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "§5 specifies the benchmark is for 'benign evaluators, such as model developers' to assess alignment without iterative LLM judge calls, and explicitly warns it should not be used as a sole metric due to adversarial gaming vulnerability.",
    193           "source": "haiku"
    194         }
    195       }
    196     }
    197   },
    198   "claims": [
    199     {
    200       "claim": "LLMs show high generation-evaluation consistency (ρ=0.971 Spearman's) on Arena-Hard with GPT-4o as oracle",
    201       "evidence": "Figure 2 and Table 1 show direct measurement across 15 LLMs; stability confirmed in Appendix D across leave-one-out variants",
    202       "supported": "strong"
    203     },
    204     {
    205       "claim": "Consistency filtering of oracle-inconsistent instances substantially improves GE-consistency measurement",
    206       "evidence": "Table 1 shows correlation rises from 0.793 to 0.971 on Arena-Hard and 0.743 to 0.839 on AlpacaEval with filtering",
    207       "supported": "strong"
    208     },
    209     {
    210       "claim": "ALIGNEVAL matches or surpasses AlpacaEval and Arena-Hard in correlating with ChatBot Arena human rankings",
    211       "evidence": "Table 4 shows ALIGNEVAL-CLAUDE at 0.885 without IFEval combination, vs Arena-Hard-SC at 0.882 and AlpacaEval-LC at 0.746; the 0.946 figure is the IFEval-combined score",
    212       "supported": "strong"
    213     },
    214     {
    215       "claim": "ALIGNEVAL combined with IFEval achieves 0.94 Spearman's correlation with ChatBot Arena rankings across 23 LLMs",
    216       "evidence": "Table 4 reports ALIGNEVAL-GPT+IFEval at 0.946 and ALIGNEVAL-CLAUDE+IFEval at 0.946",
    217       "supported": "strong"
    218     },
    219     {
    220       "claim": "ALIGNEVAL demonstrates self-preference bias, with each variant ranking its oracle's model family higher",
    221       "evidence": "§4.3 explicitly notes that ALIGNEVAL-GPT ranks gpt-4o-2024-05-13 second while ALIGNEVAL-CLAUDE ranks claude-3.5-sonnet highest",
    222       "supported": "moderate"
    223     },
    224     {
    225       "claim": "GE-consistency is a general pattern holding across diverse instruction types including open-domain tasks",
    226       "evidence": "Appendix C shows ρ=0.938 on WildBench with its mixed instruction distribution including creative writing and reasoning",
    227       "supported": "moderate"
    228     }
    229   ],
    230   "methodology_tags": [
    231     "benchmark-eval",
    232     "observational"
    233   ],
    234   "key_findings": "LLMs exhibit strong generation-evaluation consistency (GE-consistency): their ranking as generators strongly correlates with their ranking as evaluators when assessed by a strong oracle (GPT-4o on Arena-Hard: ρ=0.971), though this holds only under specific conditions — strong oracle, challenging instructions, and consistency filtering. Exploiting this finding, ALIGNEVAL evaluates LLM alignment by measuring evaluation performance on fixed preference-annotated instances, achieving ρ=0.885 correlation with ChatBot Arena (ALIGNEVAL-CLAUDE) — matching Arena-Hard-SC (0.882) while eliminating the need for LLM judges on new model outputs. Combined with IFEval (ALIGNEVAL+), correlation reaches 0.946, outperforming most existing benchmarks at effectively zero inference cost. A key vulnerability is identified: models could be fine-tuned to game ALIGNEVAL without genuinely improving alignment.",
    235   "red_flags": [
    236     {
    237       "flag": "Circular oracle dependency",
    238       "detail": "ALIGNEVAL-GPT is constructed using GPT-4o annotations and validated against ChatBot Arena; OpenAI also provided API credits, creating a circular relationship between funder, primary tool, and benchmark construction."
    239     },
    240     {
    241       "flag": "Self-preference bias unresolved",
    242       "detail": "The paper demonstrates that both ALIGNEVAL variants exhibit oracle self-preference (GPT version favors GPT models, Claude version favors Claude models) but offers no solution beyond 'future work' using multiple oracles."
    243     },
    244     {
    245       "flag": "No contamination analysis",
    246       "detail": "ALIGNEVAL is built from Arena-Hard instructions which are public; evaluated models (especially frontier ones like GPT-4o, Gemini) may have been trained on this data, but no contamination analysis is conducted."
    247     },
    248     {
    249       "flag": "Benchmark degradation documented but unaddressed",
    250       "detail": "§4.3 notes all alignment benchmarks show lower correlations with ChatBot Arena than at release due to stronger LLMs, but proposes no mechanism to update ALIGNEVAL or maintain discriminative validity over time."
    251     },
    252     {
    253       "flag": "Limited model count for benchmark claim",
    254       "detail": "Correlation with ChatBot Arena is measured over only 23 LLMs, a small sample for establishing a benchmark's reliability; rank correlations with n=23 have wide confidence intervals not reported in the main text."
    255     }
    256   ],
    257   "cited_papers": [
    258     {
    259       "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference",
    260       "relevance": "Primary gold-standard used to validate ALIGNEVAL; crowdsourced human preference leaderboard against which all benchmarks are compared"
    261     },
    262     {
    263       "title": "AlpacaEval: An Automatic Evaluator of Instruction-Following Models",
    264       "relevance": "Direct baseline benchmark using LLMs-as-judges; ALIGNEVAL is explicitly compared to and claims to match/surpass it"
    265     },
    266     {
    267       "title": "From Crowdsourced Data to High-Quality Benchmarks: Arena-Hard and BenchBuilder Pipeline",
    268       "relevance": "Source of evaluation instances for ALIGNEVAL and primary baseline benchmark; ALIGNEVAL uses Arena-Hard instructions"
    269     },
    270     {
    271       "title": "WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild",
    272       "relevance": "Third instruction set used to validate GE-consistency generalization beyond AlpacaEval and Arena-Hard"
    273     },
    274     {
    275       "title": "Instruction-Following Evaluation for Large Language Models (IFEval)",
    276       "relevance": "Complementary benchmark combined with ALIGNEVAL to form ALIGNEVAL+ achieving 0.946 correlation with ChatBot Arena"
    277     },
    278     {
    279       "title": "RewardBench: Evaluating Reward Models for Language Modeling",
    280       "relevance": "Related benchmark for evaluating LLMs as reward models/judges; contextualizes ALIGNEVAL's evaluation paradigm"
    281     },
    282     {
    283       "title": "Benchmarking and Improving Generator-Validator Consistency of Language Models",
    284       "relevance": "Directly related prior work on GV-consistency that ALIGNEVAL builds on and distinguishes from with GE-consistency"
    285     },
    286     {
    287       "title": "The Generative AI Paradox: What It Can Create, It May Not Understand",
    288       "relevance": "Key prior work showing LLMs can have stronger generation than evaluation capabilities, motivating the GE-consistency investigation"
    289     },
    290     {
    291       "title": "MixEval: Deriving Wisdom of the Crowd from LLM Benchmark Mixtures",
    292       "relevance": "Comparison benchmark using ground-truth based evaluation; ALIGNEVAL significantly outperforms it in ChatBot Arena correlation"
    293     },
    294     {
    295       "title": "ReIFE: Re-Evaluating Instruction-Following Evaluation",
    296       "relevance": "Prior work by overlapping authors on evaluating LLM judge reliability, directly related to ALIGNEVAL's evaluation paradigm"
    297     }
    298   ],
    299   "engagement_factors": {
    300     "practical_relevance": {
    301       "score": 2,
    302       "justification": "ALIGNEVAL provides a cost-free benchmark for evaluating LLM alignment without needing LLM judges, useful for researchers and developers building evaluation pipelines."
    303     },
    304     "surprise_contrarian": {
    305       "score": 1,
    306       "justification": "The finding that evaluation capability predicts generation quality is somewhat expected for capable models; the quantification (ρ=0.97) is notable but not shocking."
    307     },
    308     "fear_safety": {
    309       "score": 0,
    310       "justification": "No safety or security concerns raised; the paper is about evaluation methodology."
    311     },
    312     "drama_conflict": {
    313       "score": 1,
    314       "justification": "Mild tension in showing established benchmarks (AlpacaEval, MixEval) perform worse than claimed with newer models, and that a zero-cost alternative can match paid approaches."
    315     },
    316     "demo_ability": {
    317       "score": 2,
    318       "justification": "GitHub repository with benchmark data is publicly available; researchers can run ALIGNEVAL on their models without API costs."
    319     },
    320     "brand_recognition": {
    321       "score": 1,
    322       "justification": "Yale University is well-known but not a tier-1 AI lab. Published at NeurIPS 2025. Uses GPT-4o and Claude as reference points."
    323     }
    324   },
    325   "hn_data": {
    326     "threads": [
    327       {
    328         "hn_id": "46398693",
    329         "title": "Emergent temporal abstractions in autoregressive models enable hierarchical RL",
    330         "points": 2,
    331         "comments": 0,
    332         "url": "https://news.ycombinator.com/item?id=46398693"
    333       },
    334       {
    335         "hn_id": "38252121",
    336         "title": "Fast unfolding of communities in large networks: 15 years later",
    337         "points": 2,
    338         "comments": 0,
    339         "url": "https://news.ycombinator.com/item?id=38252121"
    340       }
    341     ],
    342     "top_points": 2,
    343     "total_points": 4,
    344     "total_comments": 0
    345   }
    346 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs