scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28810B)
      1 {
      2   "paper": {
      3     "title": "Systematic Evaluation of LLM-as-a-Judge in LLM Alignment Tasks: Explainable Metrics and Diverse Prompt Templates",
      4     "authors": [
      5       "Hui Wei",
      6       "Shenghua He",
      7       "Tian Xia",
      8       "Fei Liu",
      9       "Andy Wong",
     10       "Jingyang Lin",
     11       "Mei Han"
     12     ],
     13     "year": 2024,
     14     "venue": "Building Trust Workshop at ICLR 2025",
     15     "arxiv_id": "2408.13006",
     16     "doi": "10.48550/arXiv.2408.13006"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Code is released at https://github.com/shenghh2015/llm-judge-eval, linked in the paper header."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper uses two publicly available datasets: TL;DR Summarization (Völske et al., 2017; Stiennon et al., 2020) and HH-RLHF-Helpfulness (Bai et al., 2022). Both are standard public benchmarks."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No environment specifications (requirements.txt, Dockerfile, dependency versions) are provided in the paper."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "While the experimental setup is described in Sections 5-6 and code is open-sourced, the paper does not include step-by-step reproduction instructions with commands to run."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Standard deviations are reported across 5 splits for all main metrics. Tables show values like '0.665 (0.003)' where parenthetical values are standard deviations."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims 'significant impact of prompt templates' and 'significant negative correlation' between accuracy and position bias without reporting any formal statistical tests (no p-values, t-tests, or correlation coefficients with significance)."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Raw accuracy values, position bias magnitudes, and length bias values are reported with context. For example, accuracies range from <0.2 to ~0.7, and specific differences between LLMs and templates are shown, providing clear magnitude context."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The sample of 200 per split (1000 total) is justified only by budget constraints: 'it is highly time-consuming and expensive to evaluate LLM judges on all the data cases.' No power analysis or formal justification for sample adequacy is provided."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Mean and standard deviation across 5 non-overlapping splits are reported for all metrics (accuracy, position bias, length bias) throughout Tables 7-14."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "GPT-3.5-turbo 'serves as the baseline in our experiments' (Section 6), with GPT-4o and GPT-4o-mini compared against it."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "GPT-4o (checkpoint 2024-08-06) and GPT-4o-mini were the latest available models. The paper explains choosing GPT-4o over GPT-4 due to comparable performance at lower cost."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No ablation study is conducted on the evaluation framework components. While the paper varies LLMs and templates parametrically, it does not remove or modify individual framework components to measure their contribution."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Multiple metrics are used: Accboth, Accrandom, position bias, length bias, and self-consistency rate (Section 4)."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No new human evaluation is conducted. The paper relies on existing human preference labels from the TL;DR and HH-RLHF datasets as ground truth, but does not have humans evaluate the framework outputs or LLM judge decisions."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "For TL;DR, they sample from the existing test set following Rafailov et al. (2024b). For HH-RLHF, they sample from the full dataset. The 5 non-overlapping splits are all used for evaluation with no tuning on the same data."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down per LLM model, per prompt template, and per dataset. Rankings tables (Tables 7-14) show per-template and per-LLM-judge breakdowns."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "The paper notes that 'several LLM judges have very low accuracies (Accboth < 0.2)' but does not analyze specific failure cases qualitatively or explain why particular LLM-template combinations fail."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Several negative findings are reported: all accuracies below 0.7 showing 'mediocre alignment,' some judges with Accboth < 0.2, Accrandom being a less effective metric than Accboth, and that complete self-consistency is unachievable even at temperature 0.0."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims (1) significant impact of prompt templates on judge performance — supported by Figures 2-3 and Tables 7-14, and (2) mediocre alignment between LLM judges and humans — supported by all Accboth values < 0.7."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper's claims about template impact are supported by a factorial design (3 LLMs × multiple templates) that controls for model identity when varying templates and vice versa. The study design is adequate for the claims made, such as 'the position bias/preference depends on both LLMs themselves and also prompt templates.'"
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The Limitations section (Section 8) explicitly bounds scope to commercial LLMs and LLM-as-a-Judge methods, noting that open-source LLMs and reward models are not tested. The title appropriately scopes to 'LLM Alignment Tasks.'"
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper offers speculative explanations (e.g., length bias may result from 'over-alignment of commercial models') but does not systematically consider confounds or alternative explanations. No robustness checks are performed."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper clearly measures agreement between LLM judges and human preference labels, and frames this as 'alignment with human preferences.' The measurement and framing are well-matched, and the paper acknowledges that human labels are treated as ground truth."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "GPT-4o is described with 'latest checkpoint on 08/06/2024' but GPT-4o-mini and GPT-3.5-turbo are referred to only by marketing names without version identifiers or snapshot dates."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Full prompt template text is provided in Appendix A.1, including complete templates from Rafailov et al., Wang et al., Shen et al., and others for both datasets."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Temperature is systematically studied (0.0, 0.1, 0.3, 0.5, 0.7) with temperature 0.1 selected for main experiments (Section 6). K=5 repetitions for flipping noise estimation is also stated."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. The paper evaluates LLMs directly via API calls with prompt templates."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 6 details data preprocessing: deduplication of shared prompts ('only one pair is kept'), stratified sampling to preserve length-difference ratios, creation of 5 non-overlapping splits of 200 samples each."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 8 'Limitations and Future Work' is a dedicated section discussing specific limitations of the study."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 8 discusses specific limitations: only commercial LLMs tested (not open-source), only LLM-as-a-Judge methods evaluated (not reward models), and plans to expand to specific models (Llama 3.1, Nemotron-4-340B-Reward) and datasets (RewardBench)."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The paper explicitly states what was not tested: 'our current studies focus on commercial LLMs rather than open-source LLMs' and 'our evaluation studies concentrate on LLM-as-a-Judge methods' rather than reward models."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "While the source datasets are public and code is open-sourced, the raw LLM judgment outputs (1000 samples × 5 repetitions × 24-30 judges) are not explicitly released."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 6 describes data collection in detail: source datasets, deduplication procedure, stratified sampling strategy, and split creation process with specific ratios (e.g., 115:85 for summarization)."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants were recruited. Data sources are standard public benchmarks (TL;DR Summarization, HH-RLHF-Helpfulness)."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Section 6 documents the full pipeline: dataset selection → deduplication of shared prompts → stratified sampling → 5 non-overlapping splits of 200 samples each. The framework pipeline is depicted in Figure 1."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding acknowledgment or grant information is provided in the paper."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are listed: UC Merced, PAII Inc., Emory University, Inflection AI, University of Rochester. Work done at PAII Inc. is noted."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "Funding source is not disclosed, so independence cannot be assessed. Authors from PAII Inc. (an AI company) and Inflection AI could have interests in the outcome."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is provided. Authors from AI companies (PAII Inc., Inflection AI) may have financial interests that are not declared."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No training data cutoff dates are stated for any of the three models (GPT-4o, GPT-4o-mini, GPT-3.5-turbo). The models may have been trained on both evaluation datasets."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of whether the LLM judges' training data includes the TL;DR (2020) or HH-RLHF (2022) datasets and their human preference labels."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "Both TL;DR Summarization (2017/2020) and HH-RLHF (2022) were published well before any of the tested models' training cutoffs. The models likely saw these datasets during training, but this contamination risk is not discussed."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. The paper uses existing human-labeled datasets."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "The paper mentions 'limited budget' and that GPT-4o is '4 to 6 times lower' cost than GPT-4, but does not report actual API costs, tokens consumed, or cost per evaluation."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No total computational budget is stated. The paper mentions budget constraints as motivation for design choices but never quantifies the actual spend."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Results are reported across 5 non-overlapping random splits with mean and standard deviation, serving the same purpose as seed sensitivity analysis. Additionally, K=5 repetitions per sample are used to estimate flipping noise."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "K=5 repetitions per sample for flipping noise estimation and S=5 splits for metrics computation are explicitly stated in Section 6."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "While 5 temperature values were tested (0.0, 0.1, 0.3, 0.5, 0.7), the total compute spent on this search is not reported."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "Temperature 0.1 is selected based on Table 1 results showing it has the highest self-consistency among non-zero temperatures, and the authors justify wanting 'a value that is not a special case, such as 0.0' to demonstrate generalizability."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Many comparisons are made across 24-30 LLM judges and multiple metrics, but no correction for multiple comparisons (Bonferroni, Holm, etc.) is applied."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors propose new metrics and evaluate them using their own framework, but do not acknowledge potential bias in this self-evaluation setup."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": false,
    329         "answer": false,
    330         "justification": "All compared LLM judges use similar API calls; compute differences between approaches are negligible."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper uses TL;DR and HH-RLHF human preference labels as ground truth without discussing whether these labels are reliable measures of true human preferences or whether annotator disagreement affects the benchmark's validity."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is involved; LLMs are evaluated directly via API calls."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "Both benchmark datasets (TL;DR 2017/2020, HH-RLHF 2022) were published years before the tested models were trained. The models may have seen the preference labels during training, but this temporal leakage is not discussed."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the evaluation setup leaks answer information. The models may have learned the human preference patterns for these specific datasets during training."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "While the paper creates 5 non-overlapping evaluation splits, it does not address whether the LLMs' training data included these benchmark examples, creating a fundamental independence violation."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No concrete leakage detection or prevention methods are used (no canary strings, no membership inference, no decontamination)."
    363       }
    364     }
    365   },
    366   "scan_version": 3,
    367   "active_modules": [
    368     "experimental_rigor",
    369     "data_leakage"
    370   ],
    371   "claims": [
    372     {
    373       "claim": "Prompt templates have a significant impact on LLM judge performance",
    374       "evidence": "Figures 2a-2b show accuracy (Accboth) varies substantially across templates for the same LLM, with some judges scoring below 0.2 and the best around 0.67. Tables 7-14 show template rankings differ across models.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "All tested LLM judges show mediocre alignment with human evaluators (Accboth < 0.7)",
    379       "evidence": "Section 7 states 'all the accuracies on both datasets are below 0.7' as shown in Figures 2a and 2b across all 24-30 LLM judge configurations.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "There is a significant negative correlation between accuracy and the level of position bias",
    384       "evidence": "Figures 4a and 4b show scatter plots of |PB| vs Accboth for TL;DR and HH-RLHF datasets, showing higher-accuracy judges tend to have lower position bias. However, no formal correlation coefficient or statistical test is reported.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "All tested LLM judges prefer longer responses more than human evaluators",
    389       "evidence": "Figures 3c and 3d show positive length bias for all judges on both datasets. Section 7 states 'all the tested LLM judges have stronger preferences for longer responses compared to human judges.'",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Higher temperatures reduce self-consistency without significantly affecting accuracy",
    394       "evidence": "Table 1 shows the self-consistency rate decreases from 0.977 (T=0.0) to 0.946 (T=0.7) for GPT-4o on TL;DR, while accuracy remains stable between 0.657 and 0.668. Tables 4-6 confirm the same trend for other models.",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "GPT-4o and GPT-4o-mini have higher accuracies than GPT-3.5-turbo regardless of prompt template",
    399       "evidence": "Section 7 states 'both GPT-4o and GPT-4o-mini have higher accuracies no matter which prompt template is used' as shown in Figures 2a-2b and ranking tables.",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "Position bias depends on both the LLM and the prompt template, with the same template causing opposite biases in different models",
    404       "evidence": "Section 7 states 'varying prompt templates can cause the same large language model to exhibit preferential biases towards different positions' and 'different large language models can show opposite position preferences using the same template.' Shown in Figures 3a-3b.",
    405       "supported": "strong"
    406     },
    407     {
    408       "claim": "Accboth is a more effective metric than Accrandom for assessing LLM judge performance",
    409       "evidence": "Section 7 shows that the gap between GPT-3.5-turbo and newer models shrinks with Accrandom because it 'involves randomly selecting a position when LLM judge selection is inconsistent across two positions, thereby not reflecting the internal capabilities of LLM judges.'",
    410       "supported": "moderate"
    411     }
    412   ],
    413   "methodology_tags": [
    414     "benchmark-eval"
    415   ],
    416   "key_findings": "LLM-as-a-Judge performance is highly sensitive to prompt templates, with accuracy varying from below 0.2 to about 0.67 depending on the template used. All tested LLM judges (GPT-4o, GPT-4o-mini, GPT-3.5-turbo) show mediocre alignment with human preferences (Accboth < 0.7) and exhibit stronger length bias toward longer responses than human evaluators. The paper introduces theoretically grounded metrics that explicitly model LLM self-inconsistency (flipping noise) and separate it from the position and length bias measurements. A negative correlation between accuracy and position bias magnitude suggests that more capable LLM judges are also less positionally biased.",
    417   "red_flags": [
    418     {
    419       "flag": "Claims of significance without statistical tests",
    420       "detail": "The paper repeatedly claims 'significant impact' and 'significant negative correlation' but never reports formal statistical tests (p-values, correlation coefficients with significance tests). The word 'significant' is used informally, not statistically."
    421     },
    422     {
    423       "flag": "Contamination risk unaddressed",
    424       "detail": "Both evaluation datasets (TL;DR Summarization 2017/2020, HH-RLHF 2022) were published well before the tested models' training cutoffs. The models may have seen these datasets and their human preference labels during training, potentially inflating or distorting alignment scores. This is never discussed."
    425     },
    426     {
    427       "flag": "Small sample size without power analysis",
    428       "detail": "200 samples per split (1000 total) from datasets of 124K-143K examples is motivated only by budget constraints. No power analysis or formal justification that this sample size is sufficient for the claims made, particularly for the cross-template comparisons."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "GPT-4 Technical Report",
    434       "authors": ["Josh Achiam"],
    435       "year": 2023,
    436       "arxiv_id": "2303.08774",
    437       "relevance": "Primary commercial LLM family evaluated as judges; foundational model for LLM-as-a-Judge research."
    438     },
    439     {
    440       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    441       "authors": ["Lianmin Zheng", "Wei-Lin Chiang"],
    442       "year": 2024,
    443       "relevance": "Foundational work on LLM-as-a-Judge evaluation methodology; provides position bias definitions and prompt templates used in this study."
    444     },
    445     {
    446       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    447       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"],
    448       "year": 2024,
    449       "arxiv_id": "2305.18290",
    450       "relevance": "Key LLM alignment algorithm; provides prompt templates used across experiments in this study."
    451     },
    452     {
    453       "title": "Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback",
    454       "authors": ["Yuntao Bai"],
    455       "year": 2022,
    456       "arxiv_id": "2204.05862",
    457       "relevance": "Source of the HH-RLHF-Helpfulness dataset used as one of two main evaluation benchmarks."
    458     },
    459     {
    460       "title": "Large Language Models Are Not Fair Evaluators",
    461       "authors": ["Peiyi Wang"],
    462       "year": 2023,
    463       "arxiv_id": "2305.17926",
    464       "relevance": "Directly related work on position bias in LLM-based evaluators."
    465     },
    466     {
    467       "title": "Large Language Models Are Inconsistent and Biased Evaluators",
    468       "authors": ["Rickard Stureborg"],
    469       "year": 2024,
    470       "arxiv_id": "2405.01724",
    471       "relevance": "Directly studies LLM judge inconsistency and bias, a core concern of this paper."
    472     },
    473     {
    474       "title": "A Survey on LLM-as-a-Judge",
    475       "authors": ["Jiawei Gu"],
    476       "year": 2024,
    477       "arxiv_id": "2411.15594",
    478       "relevance": "Comprehensive survey of LLM-as-a-Judge methods and their applications."
    479     },
    480     {
    481       "title": "Judging the Judges: A Systematic Investigation of Position Bias in Pairwise Comparative Assessments by LLMs",
    482       "authors": ["Lin Shi"],
    483       "year": 2024,
    484       "arxiv_id": "2406.07791",
    485       "relevance": "Systematic study of position bias in LLM judges, closely related to this paper's position bias analysis."
    486     },
    487     {
    488       "title": "Judging the Judges: Evaluating Alignment and Vulnerabilities in LLMs-as-Judges",
    489       "authors": ["Aman Singh Thakur"],
    490       "year": 2024,
    491       "arxiv_id": "2406.12624",
    492       "relevance": "Evaluates alignment and vulnerabilities in LLM judge systems."
    493     },
    494     {
    495       "title": "LLM Evaluators Recognize and Favor Their Own Generations",
    496       "authors": ["Arjun Panickssery"],
    497       "year": 2024,
    498       "arxiv_id": "2404.13076",
    499       "relevance": "Studies self-preference bias in LLM evaluators, a form of judge bias."
    500     },
    501     {
    502       "title": "Verbosity Bias in Preference Labeling by Large Language Models",
    503       "authors": ["Keita Saito"],
    504       "year": 2023,
    505       "arxiv_id": "2310.10076",
    506       "relevance": "Directly studies length/verbosity bias in LLM preference labeling, a core metric in this paper."
    507     },
    508     {
    509       "title": "Learning to Summarize from Human Feedback",
    510       "authors": ["Nisan Stiennon"],
    511       "year": 2020,
    512       "relevance": "Source of the TL;DR summarization dataset used as one of two main evaluation benchmarks."
    513     }
    514   ],
    515   "engagement_factors": {
    516     "practical_relevance": {
    517       "score": 2,
    518       "justification": "Provides an open-source framework practitioners can use to evaluate LLM judges before deploying them for alignment tasks."
    519     },
    520     "surprise_contrarian": {
    521       "score": 1,
    522       "justification": "The sensitivity to prompt templates and mediocre alignment levels are somewhat expected findings, though the magnitude is documented rigorously."
    523     },
    524     "fear_safety": {
    525       "score": 0,
    526       "justification": "No AI safety or security concerns are raised; the paper focuses on evaluation methodology."
    527     },
    528     "drama_conflict": {
    529       "score": 0,
    530       "justification": "No controversy or conflict angle; a straightforward empirical evaluation."
    531     },
    532     "demo_ability": {
    533       "score": 2,
    534       "justification": "Open-source code on GitHub allows users to evaluate their own LLM judges with custom templates."
    535     },
    536     "brand_recognition": {
    537       "score": 1,
    538       "justification": "Evaluates GPT-4o, which is well known, but the paper is from mid-tier academic institutions and PAII Inc."
    539     }
    540   }
    541 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs