scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31193B)
      1 {
      2   "paper": {
      3     "title": "On Evaluating LLM Alignment by Evaluating LLMs as Judges",
      4     "authors": [
      5       "Yixin Liu",
      6       "Pengfei Liu",
      7       "Arman Cohan"
      8     ],
      9     "year": 2025,
     10     "venue": "NeurIPS 2025",
     11     "arxiv_id": "2511.20604",
     12     "doi": "10.48550/arXiv.2511.20604"
     13   },
     14   "scan_version": 3,
     15   "active_modules": [
     16     "experimental_rigor",
     17     "data_leakage"
     18   ],
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Footnote 1 states 'ALIGNEVAL is available at https://github.com/yale-nlp/AlignEval' providing a GitHub repository URL."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The ALIGNEVAL benchmark data (2671 instances) is released via the GitHub repository. The paper also uses publicly available datasets (AlpacaEval, Arena-Hard, HelpSteer3, ChatBot Arena rankings)."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No environment specifications, requirements files, or dependency details are mentioned in the paper."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are provided in the paper. A GitHub link is given but no commands or reproduction guide is described in the text."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Main results (Tables 3-4) report only point estimates for Spearman's correlations, win rates, and Cohen's Kappa without any confidence intervals or error bars."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper compares benchmark correlations (e.g., ALIGNEVAL 0.885 vs Arena-Hard-SC 0.882 in Table 4) without any significance tests. Claims that ALIGNEVAL 'matches or surpasses' baselines rely on comparing raw numbers."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Spearman's rank correlations are reported throughout (e.g., 0.971, 0.839, 0.938), providing the magnitude of the GE-consistency relationship. Win rates and Cohen's Kappa values provide baseline context for interpreting all comparisons."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification is provided for why 15 LLMs were selected for the initial study or why 23 total LLMs were used for benchmarking. The selection criteria mention 'succinct coverage of model sizes and families' but offer no formal justification."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Appendix D shows a leave-one-out stability analysis (Figure 6), but main results in Tables 3-4 have no variance, standard deviation, or spread measures across runs. Evaluations appear to be single-pass."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Table 2 and §4.2 compare ALIGNEVAL against multiple baselines: AlpacaEval (raw and length-controlled), Arena-Hard (raw and style-controlled), GPT4o-Judge, MixEval, IFEval, and HelpSteer3."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Baselines include recent benchmarks: Arena-Hard (2024), WildBench (2025), MixEval (2024), IFEval (2023), and HelpSteer3 (2025). These represent current state-of-the-art LLM evaluation approaches."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Table 1 ablates the consistency filtering step (0.971 → 0.793 on Arena-Hard without it). Figure 3 ablates the preference oracle across 15 LLMs. Different instruction sets (AlpacaEval, Arena-Hard, WildBench) are compared. ALIGNEVAL-GPT vs ALIGNEVAL-CLAUDE ablates the oracle choice."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Multiple metrics are used: Spearman's rank correlation for benchmark evaluation (Table 4), Cohen's Kappa for evaluation performance, win rates for generation performance, and accuracy is discussed as an alternative to Cohen's Kappa in §3.2.1."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No direct human evaluation is conducted. ChatBot Arena rankings serve as the gold-standard human preference signal, but these are externally sourced, not collected by the authors."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "ALIGNEVAL is constructed from 15 LLMs' data and then all 23 LLMs (including the original 15) are benchmarked on it. There is no explicit held-out split separating benchmark construction from evaluation."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Table 3 provides per-benchmark, per-model breakdowns across all 23 LLMs and 10+ benchmark variants. Table 4 breaks down results with and without IFEval combination."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "§4.3 identifies self-preference bias as a failure mode. §5 discusses adversarial vulnerability. §3.3 shows GE-consistency breaks down with weak oracles (e.g., llama-3-8b yields near-zero consistency)."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper reports that GE-consistency on AlpacaEval is substantially lower than Arena-Hard (0.839 vs 0.971). Table 1 shows filtering is critical. §3.3 shows weak oracles fail. §4.3 notes all benchmarks show lower correlations with ChatBot Arena than reported at release."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims ALIGNEVAL 'matches or surpasses' AlpacaEval and Arena-Hard. Table 4 supports this: ALIGNEVAL-CLAUDE (0.885) > Arena-Hard-SC (0.882) and >> AlpacaEval-LC (0.746). The 0.94 combined correlation with IFEval is supported by Table 4 (0.946)."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper is careful to use correlational language ('strong correlation', 'consistency', 'suggests'). The main claim is about correlation between generation and evaluation rankings, and the study design (Spearman's rank correlation) is appropriate for this correlational claim. Ablation of filtering (Table 1) uses controlled manipulation."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Claims are generally bounded to the tested settings. The paper specifies results on specific instruction sets (Arena-Hard, AlpacaEval, WildBench), specific oracles (GPT-4o, 15 other LLMs), and specific model sets. §5 frames ALIGNEVAL as a 'proxy evaluation by design.'"
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "§3.2.2 discusses why AlpacaEval shows lower GE-consistency (more open-ended instructions). §4.3 discusses self-preference bias. §5 considers adversarial gaming. §4.2 acknowledges ChatBot Arena is 'not a true gold standard' with potential biases, citing Singh et al."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "§5 explicitly states ALIGNEVAL is 'a proxy evaluation by design' and discusses the gap: evaluating LLMs as judges is a proxy for their alignment capability. The paper acknowledges this could be gamed by fine-tuning a model to be a good judge without improving generation."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Exact model versions are specified: 'gpt-4o-2024-08-06' as oracle, 'gpt-4o-2024-05-13' for evaluation, 'gpt-4-1106-preview', 'gpt-3.5-turbo-0125', and 'gpt-4o-mini-2024-07-18'. Open-weight models include specific size variants (e.g., 'llama-3.3-70b', 'gemma-2-27b'). Tables 5-6 list all models."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Figure 4 (Appendix B) provides the full prompt template for pairwise comparison evaluation, including system message and user message with all instructions. Placeholder variables (INSTRUCTION, OUTPUT_1, OUTPUT_2) are clearly defined."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "No temperature, top-p, max tokens, or other sampling hyperparameters are reported for any of the LLM API calls, despite noting that the study requires ~80M tokens per model."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. The study involves direct LLM API calls for generation and pairwise comparison evaluation."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "§3.2.1 documents the consistency filtering process: instances where the oracle's predictions differ when output order is swapped are discarded (58.3% on AlpacaEval, 50.7% on Arena-Hard). §4.1 describes reducing to one instance per comparison by random order selection, yielding 2671 instances."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "§5 'Discussion and Conclusion' contains substantive limitations discussion including adversarial vulnerability, proxy evaluation concerns, and the recommendation to combine with IFEval to mitigate risks."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "§5 discusses specific threats: 'fine-tuning an LLM to act as a judge could artificially boost its ALIGNEVAL ranking without meaningfully improving its alignment.' §4.3 identifies specific self-preference bias (ALIGNEVAL-GPT favors GPT, ALIGNEVAL-CLAUDE favors Claude). §4.2 notes ChatBot Arena's 'opaque data collection process and potential biases.'"
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The paper does not explicitly list what settings, populations, or model types the results do NOT generalize to. §5 notes it 'may be vulnerable to adversarial attacks' but does not state specific untested conditions or excluded scenarios."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The paper states that ALIGNEVAL is available via the GitHub repository (https://github.com/yale-nlp/AlignEval). The NeurIPS checklist confirms 'we will include the constructed dataset and the codebase in the supplemental material.'"
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "§3.2.1 describes the complete data collection procedure: instruction sets from AlpacaEval (805) and Arena-Hard (500), LLM output generation from 15 models, pairwise evaluation by GPT-4o with order-swapping, and consistency filtering."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants were recruited. Data sources are standard public benchmarks (AlpacaEval, Arena-Hard, ChatBot Arena)."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The pipeline is documented: 15 LLMs generate outputs on Arena-Hard (500 instructions) → GPT-4o evaluates pairwise comparisons (2 × 500 × 15 = 15,000 instances) → consistency filtering removes ~50.7% → random order selection yields 2671 final ALIGNEVAL instances."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Acknowledgements state: 'We are grateful for the TPU compute support provided by the Google TRC program and for the OpenAI API credits support provided by OpenAI's Researcher Access Program.'"
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are clearly listed: Yale University and Shanghai Jiao Tong University. These are academic institutions without direct product ties to the evaluated models."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "OpenAI provided API credits and GPT-4o serves as the primary preference oracle (its evaluations define the benchmark's gold standard). Google provided TPU compute and Gemini models rank highest in the evaluation. Both funders have financial interest in their models performing well."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interest declaration is present in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "Training data cutoff dates are not stated for any of the 23 evaluated LLMs. This matters because AlpacaEval and Arena-Hard are public benchmarks that could be in training data."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of whether the AlpacaEval or Arena-Hard instructions (or similar evaluation outputs) appeared in the training data of the evaluated LLMs."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "AlpacaEval (2023) and Arena-Hard (2024) are public benchmarks. Models trained after their release could have seen these instructions or similar outputs. This contamination risk is not addressed."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study. LLM evaluations are conducted entirely through API calls."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Table 2 reports estimated API costs for evaluating one LLM across benchmarks: AlpacaEval $10, Arena-Hard $20, GPT4o-Judge $2, ALIGNEVAL $0. Footnote 4 notes 'evaluating a single LLM in our study requires around 80M tokens.'"
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "The total computational budget is not quantified. Acknowledgements mention Google TPU and OpenAI API credits but do not state total GPU hours, total API spend, or total compute used for the full study."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No results across multiple random seeds. Appendix D shows a leave-one-out stability analysis (removing individual models) but this is not seed sensitivity — it tests sensitivity to model set composition."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The number of experimental runs is not explicitly stated. Each pairwise comparison is done twice (with order swapping), but it is unclear if the full evaluation pipeline was run more than once."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": false,
    310         "answer": false,
    311         "justification": "This is a benchmark construction and analysis paper; there is no hyperparameter search involved."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "The selection of Arena-Hard + GPT-4o + consistency filtering as the best configuration is systematically justified through comparisons: §3.2.2 compares instruction sets (Arena-Hard > AlpacaEval), §3.3 compares oracles (GPT-4o best), Table 1 shows filtering importance."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors construct ALIGNEVAL using the same evaluation paradigm they are comparing against (LLMs-as-Judges), then evaluate ALIGNEVAL against these same paradigms using the same gold standard (ChatBot Arena). This methodological circularity is not discussed."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": false,
    330         "answer": false,
    331         "justification": "The paper does not compare methods at different compute levels; the main comparison is between benchmark paradigms, not compute budgets."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "§5 explicitly discusses construct validity: ALIGNEVAL is 'a proxy evaluation by design' that measures evaluation capability as a stand-in for alignment. §4.2 acknowledges ChatBot Arena as an imperfect gold standard, citing Singh et al. on 'the leaderboard illusion.'"
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is involved in this study. LLMs are called directly via API for generation and evaluation tasks."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of temporal leakage. AlpacaEval (2023) and Arena-Hard (2024) are public benchmarks whose instructions could appear in training data of later models like Gemini 2.0 or Llama 3.3."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the evaluation setup leaks information. For instance, when evaluating LLMs as judges, the pairwise format with specific baseline outputs could be memorized."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "The same 15 LLMs used to construct ALIGNEVAL instances are also evaluated on the benchmark. The 8 additional LLMs provide some independence but this overlap is not discussed as a potential concern."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention methods (canary strings, n-gram overlap, decontamination) are applied to any of the benchmark data."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "There is a strong GE-consistency (Spearman's ρ = 0.971) between LLMs' generation and evaluation capability rankings on Arena-Hard with GPT-4o as the preference oracle.",
    370       "evidence": "Figure 2 shows the scatter plot of generation vs evaluation performance for 15 LLMs on Arena-Hard with 0.971 Spearman's correlation (§3.2.2).",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Consistency filtering of evaluation instances significantly improves GE-consistency (from 0.793 to 0.971 on Arena-Hard).",
    375       "evidence": "Table 1 shows the Spearman's correlation with and without filtering on both AlpacaEval (0.743→0.839) and Arena-Hard (0.793→0.971).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Stronger LLMs as preference oracles produce higher GE-consistency.",
    380       "evidence": "Figure 3 shows GE-consistency across 15 LLMs as oracles, with larger/more capable models consistently yielding higher consistency. Small models like llama-3-8b produce near-zero GE-consistency.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "ALIGNEVAL matches or surpasses widely used automatic LLM evaluation benchmarks in correlating with human preferences (ChatBot Arena rankings).",
    385       "evidence": "Table 4 shows ALIGNEVAL-CLAUDE achieves 0.885 Spearman's correlation vs Arena-Hard-SC 0.882 and AlpacaEval-LC 0.746. Combined with IFEval, ALIGNEVAL+ achieves 0.946.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "ALIGNEVAL exhibits self-preference bias: ALIGNEVAL-GPT ranks GPT-4o second, while ALIGNEVAL-CLAUDE ranks Claude 3.5 Sonnet highest.",
    390       "evidence": "§4.3 and Table 3 show the bias pattern across both oracle variants.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "GE-consistency generalizes across instruction types, with 0.938 Spearman's correlation observed on WildBench.",
    395       "evidence": "Appendix C, Figure 5 shows the scatter plot on WildBench with 0.938 correlation, which has a more balanced instruction type distribution than Arena-Hard.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "All automatic alignment benchmarks show lower correlations with ChatBot Arena than reported at their original release.",
    400       "evidence": "§4.3 finding (4) notes this trend, attributing it to stronger LLMs making evaluation more challenging. Appendix E provides non-style-controlled correlations.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval"
    406   ],
    407   "key_findings": "The paper demonstrates strong generation-evaluation consistency (GE-consistency) among LLMs: models that generate better-aligned outputs also tend to evaluate alignment more accurately, with Spearman's ρ = 0.971 on Arena-Hard using GPT-4o as oracle. Building on this finding, the authors propose ALIGNEVAL, a benchmark that assesses LLM alignment by testing their evaluation capabilities rather than their generated outputs, achieving competitive or superior correlation (0.885-0.946) with ChatBot Arena rankings compared to existing LLM-as-judge benchmarks while requiring no additional LLM judge calls. The study also reveals that GE-consistency depends critically on using strong preference oracles and consistency filtering of evaluation instances.",
    408   "red_flags": [
    409     {
    410       "flag": "Funder conflicts of interest",
    411       "detail": "OpenAI provided API credits and GPT-4o serves as the primary preference oracle whose judgments define the benchmark's gold labels. Google provided TPU compute and Gemini models rank highest in the final evaluation (Table 3). Neither conflict is discussed."
    412     },
    413     {
    414       "flag": "No significance tests for benchmark comparisons",
    415       "detail": "Table 4 compares Spearman's correlations (e.g., ALIGNEVAL-CLAUDE 0.885 vs Arena-Hard-SC 0.882) without any significance tests. The differences are small enough that they may not be statistically meaningful, yet the paper claims ALIGNEVAL 'matches or surpasses' these baselines."
    416     },
    417     {
    418       "flag": "Benchmark contamination ignored",
    419       "detail": "AlpacaEval and Arena-Hard are public benchmarks that could appear in the training data of the evaluated LLMs. No training cutoffs are stated and no contamination analysis is performed, despite this being a study about LLM evaluation reliability."
    420     },
    421     {
    422       "flag": "Overlap between construction and evaluation sets",
    423       "detail": "The 15 LLMs used to construct ALIGNEVAL instances (whose outputs and oracle judgments form the benchmark) are also among the 23 LLMs evaluated on it. This creates a circularity where the benchmark's structure is partially shaped by the models it evaluates."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Chatbot arena: An open platform for evaluating LLMs by human preference",
    429       "authors": ["W.-L. Chiang", "L. Zheng", "Y. Sheng"],
    430       "year": 2024,
    431       "relevance": "Core reference for human-preference-based LLM evaluation and the gold-standard leaderboard used to validate ALIGNEVAL."
    432     },
    433     {
    434       "title": "AlpacaEval: An automatic evaluator of instruction-following models",
    435       "authors": ["X. Li", "T. Zhang", "Y. Dubois"],
    436       "year": 2023,
    437       "relevance": "Major LLM-as-judge evaluation benchmark used as both a data source and baseline in this study."
    438     },
    439     {
    440       "title": "From crowdsourced data to high-quality benchmarks: Arena-hard and benchbuilder pipeline",
    441       "authors": ["T. Li", "W.-L. Chiang", "E. Frick"],
    442       "year": 2024,
    443       "arxiv_id": "2406.11939",
    444       "relevance": "LLM evaluation benchmark forming the primary instruction set for ALIGNEVAL construction and a key baseline."
    445     },
    446     {
    447       "title": "RewardBench: Evaluating reward models for language modeling",
    448       "authors": ["N. Lambert", "V. Pyatkin", "J. Morrison"],
    449       "year": 2024,
    450       "arxiv_id": "2403.13787",
    451       "relevance": "Benchmark for evaluating reward models including LLMs as generative reward models, closely related to LLM-as-judge evaluation."
    452     },
    453     {
    454       "title": "Instruction-following evaluation for large language models",
    455       "authors": ["J. Zhou", "T. Lu", "S. Mishra"],
    456       "year": 2023,
    457       "arxiv_id": "2311.07911",
    458       "relevance": "IFEval benchmark used as a complementary judge-free evaluation approach combined with ALIGNEVAL to achieve 0.946 correlation."
    459     },
    460     {
    461       "title": "Judging llm-as-a-judge with mt-bench and chatbot arena",
    462       "authors": ["L. Zheng", "W.-L. Chiang", "Y. Sheng"],
    463       "year": 2023,
    464       "relevance": "Foundational work on the LLM-as-judge paradigm that this paper builds upon and seeks to improve."
    465     },
    466     {
    467       "title": "Self-rewarding language models",
    468       "authors": ["W. Yuan", "R. Y. Pang", "K. Cho"],
    469       "year": 2024,
    470       "relevance": "Demonstrates self-improvement through self-evaluation, directly related to generation-evaluation consistency implications."
    471     },
    472     {
    473       "title": "The generative AI paradox: what it can create, it may not understand",
    474       "authors": ["P. West", "X. Lu", "N. Dziri"],
    475       "year": 2024,
    476       "relevance": "Key related work proposing generation-evaluation gaps in LLMs, which this paper's GE-consistency analysis directly addresses."
    477     },
    478     {
    479       "title": "WildBench: Benchmarking LLMs with challenging tasks from real users in the wild",
    480       "authors": ["B. Y. Lin", "Y. Deng", "K. Chandu"],
    481       "year": 2025,
    482       "relevance": "LLM alignment benchmark used as additional validation for GE-consistency with balanced instruction distribution."
    483     },
    484     {
    485       "title": "Benchmarking and improving generator-validator consistency of language models",
    486       "authors": ["X. L. Li", "V. Shrivastava", "S. Li"],
    487       "year": 2024,
    488       "relevance": "Directly related prior work on GV-consistency which this paper extends to the ranking-level GE-consistency concept."
    489     },
    490     {
    491       "title": "MixEval: Deriving wisdom of the crowd from LLM benchmark mixtures",
    492       "authors": ["J. Ni", "F. Xue", "X. Yue"],
    493       "year": 2024,
    494       "relevance": "Alternative LLM evaluation approach that reduces reliance on LLM judges by matching user queries to benchmark examples."
    495     },
    496     {
    497       "title": "The leaderboard illusion",
    498       "authors": ["S. Singh", "Y. Nan", "A. Wang"],
    499       "year": 2025,
    500       "arxiv_id": "2504.20879",
    501       "relevance": "Discusses biases and limitations in benchmark leaderboards including ChatBot Arena, relevant to validity of gold standard."
    502     },
    503     {
    504       "title": "RMB: Comprehensively benchmarking reward models in LLM alignment",
    505       "authors": ["E. Zhou", "G. Zheng", "B. Wang"],
    506       "year": 2025,
    507       "relevance": "Shows that frontier LLMs as judges/GRMs are competitive with fine-tuned reward models, supporting the GE-consistency premise."
    508     },
    509     {
    510       "title": "Mind the gap: Examining the self-improvement capabilities of large language models",
    511       "authors": ["Y. Song", "H. Zhang", "C. Eisenach"],
    512       "year": 2025,
    513       "relevance": "Demonstrates the verification-generation gap enabling LLM self-improvement, key related work on generation vs evaluation capabilities."
    514     }
    515   ],
    516   "engagement_factors": {
    517     "practical_relevance": {
    518       "score": 2,
    519       "justification": "ALIGNEVAL provides a cost-free benchmark for evaluating LLM alignment without needing LLM judges, useful for researchers and developers building evaluation pipelines."
    520     },
    521     "surprise_contrarian": {
    522       "score": 1,
    523       "justification": "The finding that evaluation capability predicts generation quality is somewhat expected for capable models; the quantification (ρ=0.97) is notable but not shocking."
    524     },
    525     "fear_safety": {
    526       "score": 0,
    527       "justification": "No safety or security concerns raised; the paper is about evaluation methodology."
    528     },
    529     "drama_conflict": {
    530       "score": 1,
    531       "justification": "Mild tension in showing established benchmarks (AlpacaEval, MixEval) perform worse than claimed with newer models, and that a zero-cost alternative can match paid approaches."
    532     },
    533     "demo_ability": {
    534       "score": 2,
    535       "justification": "GitHub repository with benchmark data is publicly available; researchers can run ALIGNEVAL on their models without API costs."
    536     },
    537     "brand_recognition": {
    538       "score": 1,
    539       "justification": "Yale University is well-known but not a tier-1 AI lab. Published at NeurIPS 2025. Uses GPT-4o and Claude as reference points."
    540     }
    541   }
    542 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs