ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (32302B)


      1 {
      2   "paper": {
      3     "title": "Evaluating Judges as Evaluators: The JETTS Benchmark of LLM-as-Judges as Test-Time Scaling Evaluators",
      4     "authors": [
      5       "Yilun Zhou",
      6       "Austin Xu",
      7       "Peifeng Wang",
      8       "Caiming Xiong",
      9       "Shafiq Joty"
     10     ],
     11     "year": 2025,
     12     "venue": "International Conference on Machine Learning",
     13     "arxiv_id": "2504.15253",
     14     "doi": "10.48550/arXiv.2504.15253"
     15   },
     16   "scan_version": 2,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "GitHub repository link provided in the paper header: https://github.com/SalesforceAIResearch/jetts-benchmark."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Sec. 3.1 states 'we pre-compute the model responses and release them as part of the benchmark wherever possible.' All datasets used (GSM8k, MATH, CHAMP, HumanEval+, MBPP+, BigCodeBench, AlpacaEval, IFEval) are publicly available benchmarks."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Hardware is mentioned (1 A100 40GB for ≤12B models, 8xA100 40GB for ≥8x7B models) and vLLM is named as the inference framework (App. A.2), but no requirements.txt, Dockerfile, or detailed dependency specifications are provided in the paper."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions appear in the paper text. While a GitHub repository is linked, the paper itself does not contain a 'Reproducing Results' section or equivalent."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Results in Tables 3-10 report point estimates only. Significance markers (*, **, ***) are shown on figures but no confidence intervals or ± notation appears. Figure 7 shows min/max ranges from tied responses but these are not statistical error bars."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Sec. 4.1: 'we perform statistical analyses of reported quantity differing from the baseline value (0 for reranking and beam-search, and 1 for refinement) using a two-sided one-sample t-test and indicate the significance as \"*\" for p ≤0.05, \"**\" for p ≤0.01, \"***\" for p ≤0.001.'"
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Normalized helpfulness (Eq. 1) contextualizes judge performance relative to greedy (baseline) and oracle (upper bound), providing magnitude context. Effective improvement ratio (Eq. 3) similarly reports relative magnitudes. Full normalized values are in Tables 11-15."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Dataset sizes are listed in Table 1 (ranging from 164 for HumanEval+ to 1,324 for MATH) but no justification is given for why these sizes are adequate, nor is a power analysis performed."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Responses are pre-computed and fixed to minimize randomness, but this means only single-run results are reported. No standard deviation, IQR, or variance across repeated experiments is provided. The min/max ranges in Fig. 7 reflect tied-response ambiguity, not experimental variance."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Multiple baselines included: random reranking, greedy decoding, majority vote (for math), vanilla Llama-3.1-8B-Instruct with judge prompt, three outcome reward models (OffsetBias-RM, Skywork-Reward-8B, Skywork-Reward-27B), and a process reward model (Qwen2.5-Math-PRM-7B)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Baseline reward models are recent: Skywork-Reward-v0.2, Qwen2.5-Math-PRM, OffsetBias-RM. Generator models include recent releases (Qwen-2.5, Llama-3.1). All judge models are from 2024."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Multiple ablation-style analyses: pairwise vs. single-rating protocols (Sec. 4.2), Likert vs. additive rubrics, domain-specific vs. generic prompts (Fig. 23), lookahead vs. non-lookahead beam search (Sec. 4.3), judge size ratio effects (Fig. 5)."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Multiple metrics used: normalized helpfulness (Eq. 1) for reranking and beam search, effective improvement ratio (Eq. 3) for refinement, plus dataset-specific metrics (accuracy via Math-Verify, Pass@1, win rate, prompt-level strict accuracy per Table 1)."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "App. B.3: 'We inspect over 100 refinements and find two major issues.' The authors manually inspected critique-response pairs and identified two failure modes (false positives from style focus, false negatives from over-scrutinizing style), shown in Figs. 27-28."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Standard benchmark test sets are used (GSM8k, MATH, HumanEval+, MBPP+, BigCodeBench, AlpacaEval, IFEval, CHAMP). Judge models are not tuned on these benchmarks during this study."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Extensive breakdowns provided: per-dataset (Fig. 3, Tables 3-10), per-judge (Fig. 4), per-generator (Figs. 6, 13), per-task-category (math/code/instruction following), and per-protocol (pairwise/Likert/additive). Full raw tables in Appendix."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Failure cases discussed extensively: negative helpfulness on code tasks (Fig. 4), judges worse than greedy on multiple benchmarks, detailed qualitative failure analysis of critiques in App. B.3 with specific examples (Figs. 27-28)."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Major negative findings reported: judges consistently worse than PRMs in beam search, critique-based refinement never achieves effective improvement >1.0 (Fig. 11), domain-specific prompts hurt performance (Fig. 23), code evaluation performance is negative for most judges, single-rating protocols produce over-lenient results."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims are supported: 'judges are competitive with outcome reward models in reranking' (Fig. 4, Best RM comparison), 'consistently worse than process reward models in beam search' (Fig. 9, QPRM comparison), 'natural language critiques are currently ineffective' (Fig. 11, all δ(Eff) < 1.0)."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Claims are appropriately hedged as observational: 'judge-specific finetuning seems to primarily boost instruction-following evaluation abilities' (Sec. 4.2), 'This may suggest an intrinsic pliability' (Sec. 4.4). Controlled comparisons (same setup, varying one factor) support the comparative claims adequately."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Claims are bounded by 'current' temporal qualifier and refer to the tested judges. The paper tests 10 judges across 3 domains and 8 generators, providing broad but bounded coverage. The title correctly identifies this as 'The JETTS Benchmark' rather than making universal claims."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "Limited systematic discussion of alternatives. For the main finding that critiques are ineffective, the paper identifies failure modes (style over substance) but doesn't consider whether the refinement prompt design (Fig. 18), generator capabilities, or other factors could explain the results. The JETTS vs. RewardBench difficulty gap discussion (Sec. 1) is a notable exception."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Metrics match claims well. Normalized helpfulness (Eq. 1) directly measures what is claimed (judge improvement over greedy baseline, relative to oracle). The paper does not conflate benchmark performance with broader capabilities — claims are about judge performance in specific test-time scaling settings."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Specific model names with versions and sizes provided in Table 2: e.g., 'Llama-3.1-8B-Instruct', 'Qwen-2.5-72B-Instruct', 'SFR-Judge-70B', 'Skywork-Critic-8B'. For open-source models, these names uniquely identify checkpoints. GPT-4o is used only for CHAMP grading, not as primary subject."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Full prompt texts provided in Appendix: pairwise evaluation (Fig. 15), Likert rating (Fig. 16), additive rating (Fig. 17), refinement prompt (Fig. 18), and domain-specific evaluation criteria (Fig. 19). The partial response note for beam search is also included."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Sampling parameters reported: temperature 1.0, top-p 0.95 (Sec. 3.2). Beam search: (10,2) configuration with depth limit 10 (Sec. 3.3). Refinement: (1,9) setup (Sec. 3.4). Integer rating scale 1-5 specified."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. The benchmark evaluates judge models directly on reranking, beam search, and refinement tasks without an agentic framework."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Response generation procedure documented: greedy + 9 sampled responses per query (Sec. 3.2). Evaluation procedures detailed per dataset in App. A.1. Pre-computation and storage of all responses and GPT-4o evaluations to ensure consistency. Pairwise consistency check with position swapping described (App. B.1)."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Sec. 5 ('Conclusion and Future Work') identifies specific limitations with corresponding research directions: the pairwise performance-efficiency dilemma (O(N²)), insufficient CoT reasoning for judgment and critiques, and the need for better reasoning capabilities. This constitutes substantive discussion."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No dedicated threats-to-validity discussion. Some specific methodological concerns are noted in passing (CHAMP oracle overestimation in App. B.2, positional bias mitigation in App. B.1), but these are not framed as threats to validity and no systematic analysis of how these could affect conclusions is provided."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No explicit 'what the results do NOT show' statements. The paper does not explicitly state what settings, populations, or claims are excluded from its scope. The practitioner note (Sec. 3.5) offers practical guidance but does not delineate scope boundaries."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Pre-computed model responses are released as part of the benchmark (Sec. 3.1: 'we pre-compute the model responses and release them as part of the benchmark wherever possible'). This allows independent verification of judge evaluations."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Data collection described: 8 public benchmarks listed with sizes and metrics (Table 1), response generation procedure (greedy + 9 sampled with temperature 1.0, top-p 0.95), evaluation methods per dataset detailed in App. A.1."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. All data comes from standard public benchmarks (GSM8k, MATH, CHAMP, HumanEval+, MBPP+, BigCodeBench, AlpacaEval, IFEval)."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Full pipeline documented: query selection from datasets → response generation (greedy + sampled) → judge evaluation (pairwise or single-rating) → metric computation (normalized helpfulness, improvement ratio). Pre-computation of GPT-4o and GPT-4 Turbo evaluations for consistent grading across judges."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding disclosure or acknowledgments section in the paper. All authors are from Salesforce AI Research but no funding sources are mentioned."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "All five authors are listed as affiliated with 'Salesforce AI Research' on the first page."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "Salesforce AI Research developed SFR-Judge (Wang et al., 2024a), one of the evaluated judge models. SFR-Judge-70B achieves the highest reranking and beam search performance among judges in the leaderboard (Fig. 1), and Salesforce has a commercial interest in demonstrating its judge model's effectiveness."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial disclosure statement appears in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No training data cutoff dates are stated for any of the judge models or generator models. The judges may have seen benchmark solutions (GSM8k, MATH, HumanEval all published 2021) during training."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of whether judge or generator models' training data overlaps with the benchmark datasets. Judges could have memorized correct solutions, affecting their ability to identify correct responses."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No contamination analysis despite using well-known public benchmarks (GSM8k from 2021, MATH from 2021, HumanEval from 2021) with models trained well after these benchmarks were published. The contamination risk for both generators and judges is not discussed."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. All evaluation is automated using benchmark datasets and model-generated responses."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "Computational complexity is discussed qualitatively (O(N²) for pairwise vs. O(N) for single-rating, Sec. 5), and hardware is mentioned (App. A.2), but no actual inference costs, latency measurements, or per-example timing are reported."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "Hardware specifications mentioned (1 A100 40GB for ≤12B models, 8xA100 40GB for ≥8x7B in App. A.2), but total GPU hours, wall-clock time, or total compute budget for the experiments are not stated."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No seed sensitivity analysis. Responses are pre-computed once to 'minimize the effect of randomness' (Sec. 3.1), but no analysis of how different random seeds would affect the sampled responses or resulting judge evaluations."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The number of samples per query (10) and beam search configuration (10,2) are stated, but the paper does not explicitly state how many times the full experiment was run. Pre-computed responses imply a single run, but this is not stated."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No hyperparameter search budget reported. Temperature (1.0), top-p (0.95), and beam parameters (10,2) appear chosen without justification or search."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "Sec. 3.5: 'If a judge supports multiple protocols (e.g., both single rating and pairwise comparisons in reranking), we report the maximum aggregate performance among all protocols.' The selection criterion is transparent and all raw data for all protocols is provided in the appendix tables."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Many t-tests are performed across 8 datasets, 10 judges, and up to 8 generators without any correction for multiple comparisons (no mention of Bonferroni, Holm, or Benjamini-Hochberg)."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "SFR-Judge is developed by Salesforce AI Research (the authors' affiliation). SFR-Judge-70B achieves the top leaderboard position among judges. No acknowledgment of potential bias from evaluating their own model alongside others."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The paper discusses asymptotic complexity (O(N²) vs. O(N)) and notes beam search requires more compute, but does not plot or report performance as a function of actual compute budget. No matched-compute comparisons are provided."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "Sec. 1 explicitly discusses construct validity: 'RewardBench forms pairs of responses from different generators, meaning judges can arrive at the right outcome via stylistic factors, like formatting. JETTS, however, requires judges to compare responses sampled from the same generator, limiting stylistic factors.' Fig. 2 compares JETTS vs. RewardBench performance."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is involved. Judges are evaluated directly on their output quality without agentic scaffolding."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of temporal leakage. Benchmarks like GSM8k (2021), MATH (2021), and HumanEval (2021) predate all evaluated models' training. Judge models trained on data including these benchmark solutions could have an unfair advantage in identifying correct responses."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the evaluation setup leaks information to the judges. For example, formatting patterns in correct vs. incorrect responses could serve as features."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of potential overlap between judge/generator training data and benchmark test sets. Some judge models were likely trained on data containing benchmark problems and solutions."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No leakage detection or prevention methods applied. No canary strings, membership inference, or decontamination analysis performed."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "LLM-judges are competitive with outcome reward models in response reranking but consistently worse than process reward models in beam search.",
    369       "evidence": "Fig. 1 leaderboard: best judges (SFR-70B at 0.171, SC-70B at 0.177) outperform best RM (0.113) in reranking but lag QPRM (0.195) in beam search. Fig. 9 shows QPRM-7B substantially outperforms all judges in math beam search despite being smaller.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Natural language critiques from judges are currently ineffective in guiding generators toward better responses via iterative refinement.",
    374       "evidence": "Fig. 11: No judge achieves effective improvement ratio δ(Eff) > 1.0 for any task category. Instruction following experiences up to 10% degradation. Sec. 4.4 shows the final selected response is often the seed (index 0), and δ(G)_rand ≈ 1 across datasets.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Pairwise comparison protocol outperforms single-instance rating protocols for judge reranking.",
    379       "evidence": "Fig. 3 shows pairwise protocol achieves higher normalized helpfulness than Likert or additive single-rating on most datasets. Sec. 4.2 notes this comes at O(N²) cost vs. O(N). Fig. 25 shows single-rating judges are over-lenient, rating many responses equally high.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Weak judges (8B) cannot reliably improve strong generators (70B+) in reasoning-intensive tasks like math and code.",
    384       "evidence": "Fig. 5: At judge/generator size ratio ~0.1, normalized helpfulness is negative on average for math. The linear regression coefficient for math is 0.16 (p<0.001), showing a statistically significant relationship between size ratio and helpfulness. Code shows no improvement at any size ratio.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "No judge can reliably outperform majority-vote aggregation on math datasets.",
    389       "evidence": "Fig. 6: Only SFR-70B and SC-70B offer improvement over majority-vote on GSM8k, and only for weak generators. On MATH, only SFR-70B beats majority-vote for small-to-medium generators.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Judge-specific finetuning primarily boosts instruction-following evaluation abilities, sometimes at the cost of code evaluation.",
    394       "evidence": "Fig. 4: Vanilla Llama-3.1-8B performs at greedy level across tasks. Finetuned 7B/8B judges 'outperform it significantly on instruction-following and moderately on math, but performance degrades, sometimes significantly, on coding.' Domain-specific prompts do not mitigate this (Fig. 23).",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "JETTS reveals a scale-dependent judging ability gap not visible in RewardBench.",
    399       "evidence": "Fig. 2: Small judges (8B) perform comparably to large judges (70B) on RewardBench but substantially worse on JETTS. Skywork-Critic-8B vs 70B differ by ~4% on RewardBench but the 8B yields 'substantively lower improvements over the greedy response' on JETTS.",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "Judge critiques over-focus on stylistic features rather than response correctness.",
    404       "evidence": "App. B.3 qualitative study of 100+ refinements: Fig. 27 shows judge giving high score to mathematically incorrect response for being 'easy to follow'. Fig. 28 shows judge criticizing correct response for having 'more steps than necessary', leading to unnecessary refinement.",
    405       "supported": "moderate"
    406     }
    407   ],
    408   "methodology_tags": ["benchmark-eval"],
    409   "key_findings": "The JETTS benchmark reveals that LLM-judges are competitive with outcome reward models for response reranking (especially in instruction following) but consistently lag process reward models in step-level beam search. Natural language critiques, a unique advantage of judges over scalar reward models, are currently ineffective for guiding generator refinement — no evaluated judge achieves an effective improvement ratio above 1.0. The pairwise evaluation protocol substantially outperforms single-rating protocols but at O(N²) computational cost. Critically, weak judges cannot improve strong generators on reasoning tasks, and the scale gap between judges that JETTS reveals is masked by simpler benchmarks like RewardBench.",
    410   "red_flags": [
    411     {
    412       "flag": "Self-evaluation bias",
    413       "detail": "SFR-Judge is developed by Salesforce AI Research (the authors' affiliation). SFR-Judge-70B achieves the highest reranking and beam search scores among judges on the leaderboard (Fig. 1). This conflict is not acknowledged or discussed anywhere in the paper."
    414     },
    415     {
    416       "flag": "No contamination analysis",
    417       "detail": "Several of the benchmarks (GSM8k, MATH, HumanEval, and MBPP, all published in 2021) predate the training of all evaluated models. Judges that memorized correct solutions during training could show artificially inflated performance in reranking tasks. No contamination check is performed."
    418     },
    419     {
    420       "flag": "No multiple comparison correction",
    421       "detail": "Hundreds of t-tests are performed across 8 datasets × 10 judges × up to 8 generators. No family-wise error rate or false discovery rate correction is applied, inflating the likelihood of false-positive significance findings."
    422     },
    423     {
    424       "flag": "Single-run results",
    425       "detail": "Responses are pre-computed once and fixed. While this ensures consistency across judge evaluations, it means results depend entirely on one set of sampled responses. No analysis of sensitivity to different response samples is provided."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "RewardBench: Evaluating Reward Models for Language Modeling",
    431       "authors": ["Nathan Lambert", "Valentina Pyatkin", "Jacob Morrison"],
    432       "year": 2024,
    433       "arxiv_id": "2403.13787",
    434       "relevance": "Primary comparison benchmark for evaluating reward models and LLM-judges; JETTS positions itself against RewardBench's limitations."
    435     },
    436     {
    437       "title": "Scaling LLM Test-Time Compute Optimally Can Be More Effective Than Scaling Model Parameters",
    438       "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"],
    439       "year": 2024,
    440       "arxiv_id": "2408.03314",
    441       "relevance": "Key prior work on test-time compute scaling with fixed reward models; JETTS extends this by systematically evaluating LLM-judges in the same settings."
    442     },
    443     {
    444       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    445       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    446       "year": 2023,
    447       "relevance": "Foundational work on LLM-as-judge paradigm and MT-Bench evaluation framework."
    448     },
    449     {
    450       "title": "Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models",
    451       "authors": ["Seungone Kim", "Juyoung Suk", "Shayne Longpre"],
    452       "year": 2024,
    453       "arxiv_id": "2405.01535",
    454       "relevance": "One of the primary evaluated judge models; represents open-source judge model development with flexible evaluation criteria."
    455     },
    456     {
    457       "title": "Self-Taught Evaluators",
    458       "authors": ["Tianlu Wang", "Ilia Kulikov", "Olga Golovneva"],
    459       "year": 2024,
    460       "arxiv_id": "2408.02666",
    461       "relevance": "Evaluated judge model using iterative DPO self-teaching for pairwise evaluation; represents alternative training methodology for judges."
    462     },
    463     {
    464       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    465       "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath"],
    466       "year": 2024,
    467       "relevance": "Key prior work on using natural language critiques for iterative response refinement in agentic settings."
    468     },
    469     {
    470       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    471       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    472       "year": 2023,
    473       "relevance": "Developed HumanEval+ and MBPP+ benchmarks with improved testing rigor, used as code generation evaluation datasets in JETTS."
    474     },
    475     {
    476       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    477       "authors": ["Terry Yue Zhuo"],
    478       "year": 2024,
    479       "arxiv_id": "2406.15877",
    480       "relevance": "Code generation benchmark with diverse function calls used in JETTS evaluation; represents more complex code generation assessment."
    481     },
    482     {
    483       "title": "Large Language Monkeys: Scaling Inference Compute with Repeated Sampling",
    484       "authors": ["Bradley Brown", "Jordan Juravsky", "Ryan Ehrlich"],
    485       "year": 2024,
    486       "arxiv_id": "2407.21787",
    487       "relevance": "Studies scaling test-time compute through repeated sampling for code and math tasks, complementary to JETTS's judge-focused evaluation."
    488     },
    489     {
    490       "title": "Let's Verify Step by Step",
    491       "authors": ["Hunter Lightman", "Vineet Kosaraju", "Yuri Burda"],
    492       "year": 2023,
    493       "arxiv_id": "2305.20050",
    494       "relevance": "Foundational work on process reward models for step-level verification in math reasoning; PRMs shown to outperform judges in JETTS beam search."
    495     },
    496     {
    497       "title": "JudgeBench: A Benchmark for Evaluating LLM-Based Judges",
    498       "authors": ["Sijun Tan", "Siyuan Zhuang", "Kyle Montgomery"],
    499       "year": 2024,
    500       "arxiv_id": "2410.12784",
    501       "relevance": "Identifies shortcomings in RewardBench's reasoning samples and proposes harder pairwise evaluation for judges; complementary judge evaluation benchmark."
    502     },
    503     {
    504       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    505       "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"],
    506       "year": 2024,
    507       "relevance": "Key prior work on self-refinement using model feedback; JETTS tests whether external judge critiques can achieve what self-refinement promises."
    508     }
    509   ]
    510 }

Impressum · Datenschutz