ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27871B)


      1 {
      2   "paper": {
      3     "title": "Foundational Automatic Evaluators: Scaling Multi-Task Generative Evaluator Training for Reasoning-Centric Domains",
      4     "authors": ["Austin Xu", "Xuan-Phi Nguyen", "Yilun Zhou", "Chien-Sheng Wu", "Caiming Xiong", "Shafiq Joty"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2510.17793",
      8     "doi": "10.48550/arXiv.2510.17793"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "No repository URL or code release link is provided in the paper. The paper mentions using a 'modified version of the OpenRLHF framework' but does not release their modifications or training code."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "The paper describes curating 2.5M training samples from various sources but does not release the curated dataset. Individual source datasets are public, but the assembled training mix is not released."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No environment specifications, requirements files, or dependency lists are provided. The paper mentions using OpenRLHF but does not specify versions or dependencies."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are provided. While training hyperparameters are listed in §B.2, there are no runnable scripts or commands."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "All results tables (Tables 1-4, 7-10) report only point estimates with no confidence intervals, error bars, or ± notation."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper makes many claims of the form 'FARE-8B outperforms X by Y points' (e.g., §4.1) based solely on comparing numbers without any statistical significance tests."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper consistently reports absolute point differences with baselines, e.g., 'outperforming RM-R1-14B by 6.57 absolute points on JudgeBench' (§4.1), 'a nearly 11 point absolute gain' (§4.2), providing baseline context."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No justification is given for the choice of 2.5M training samples, the number of benchmarks, or benchmark sizes. No power analysis is discussed."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No variance, standard deviation, or spread measures are reported for any experiments. Results appear to be single-run numbers. The K=4 rollout sampling is part of training, not evaluation variance."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Extensive baselines are included across all benchmarks: Tables 1-3 compare against 10+ baselines including RISE-Judge, EvalPlanner, J1, RM-R1, CompassJudger, Atla Selene, and frontier models."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Baselines are contemporary, including recently released models like J1, RM-R1, StepWiser, GPT-5, and gpt-oss-120B (all 2024-2025 releases)."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "§D.1 (Table 6) ablates three components: proportion of direct judgment data (30-70%), continuous curriculum, and CoT handling for gpt-oss. §D.2 ablates critique vs. direct judgment prompting."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Multiple metrics are used: consistent accuracy for pairwise, F1 for ProcessBench, accuracy for VerifyBench, Pearson correlation for single-rating, and downstream task performance for JETTS/GRPO experiments."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No human evaluation of FARE's outputs is included. All evaluation is automated via benchmark scoring. For an evaluator paper, human assessment of evaluation quality would be relevant."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "All evaluation benchmarks (JudgeBench, ProcessBench, VerifyBench, etc.) are external held-out test sets not used during training. The paper mentions N-gram decontamination (§B.1)."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Per-category breakdowns are provided throughout: ProcessBench shows GSM8K/MATH/OlympiadBench/OmniMATH splits (Table 2), JETTS shows per-benchmark results (Table 10), GRPO shows per-benchmark downstream results (Fig. 4)."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "§D.2 discusses failure cases where direct judgment prompting degrades FARE-20B performance. Table 4 notes MBPP+ degradation with SC@32. Table 10 shows cases where FARE-8B fails to improve over baseline."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Several negative results: FARE-20B performance degrades without critique/CoT (Table 7), SC@32 hurts MBPP+ (Table 4), FARE-8B struggles with larger generators on hard benchmarks (§D.6), and Qwen3-8B post-trained is noted as 'over-trained' and difficult to finetune."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims are supported: 'FARE-8B challenges larger specialized RL-trained evaluators' (Tables 1-2), 'FARE-20B surpassing specialized 70B+ evaluators' (Tables 1-2), 'near-oracle performance on MATH' (Fig. 3), '14.1% vs string-matching' (Fig. 4), 'outperforms gpt-oss-20B by 65%' (Fig. 5)."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Causal claims are mostly justified through controlled ablations (Table 6) that isolate individual factors. The GRPO experiment (Fig. 4) controls for training setup, varying only the verifier. Ablation design uses single-variable manipulation."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The title says 'Foundational Automatic Evaluators' and the paper frames FARE as general-purpose, but evaluation is concentrated on math/code/reasoning. The 'multi-domain' claim is not well-bounded — chat/safety evaluation is minimal (Table 8 shows only moderate single-rating performance)."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "No substantive discussion of alternative explanations. The gains could be partially due to data contamination, base model selection, or scale effects rather than the RS-SFT recipe, but these are not explored."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper measures benchmark accuracy as a proxy for 'evaluation quality' but does not discuss this gap. Benchmark performance may not reflect real-world evaluation utility. The JETTS reranking experiment partially addresses this but the distinction is not explicitly acknowledged."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "Base models are identified as 'Qwen3-8B-Base' and 'gpt-oss-20B' but without version/snapshot dates. Baseline models like 'GPT-4o', 'GPT-5' are used without specific API versions. Generator models for synthetic data list model families but not exact versions."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Full evaluation prompts for pairwise, step-level, reference-based verification, and direct judgment are provided in §E.1 with complete text including system and user prompt templates."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "§B.2 reports batch size 128, learning rate 1e-6, rollout batch sizes 50K/250K, K=4 rollouts at temperature 0.9. §C.2 reports GRPO training hyperparameters (rollout batch 1024, group size 8, temp 1.0, KL 0.001, LR 5e-7)."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used. FARE models are standard generative evaluators without tool use, memory, or multi-step reasoning scaffolds."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "§3.1 describes data curation in detail: existing data sources with hand-crafted rubrics, synthetic data via programmatic error injection and generate-then-grade. Table 5 enumerates all data sources. §B.1 mentions N-gram decontamination."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "There is no dedicated limitations section in the paper. The conclusion (§5) is brief and does not discuss limitations."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "No threats to validity are discussed. The paper does not address potential issues like benchmark overfitting, selection bias in benchmark choice, or generalization concerns."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No explicit scope boundaries are stated. The paper does not specify what settings or domains FARE has NOT been tested on or where it might fail."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "Raw training data, model outputs, and evaluation predictions are not released. Only aggregate benchmark numbers are reported."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "§3.1 describes data collection in detail: existing data sources are enumerated in Table 5, synthetic data generation uses 12 generators with specific sampling strategies, and rubric creation process is described with an example in §E.2."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. Data comes from existing datasets and synthetic generation."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The data pipeline is documented: Table 5 lists all sources, Fig. 2 shows breakdowns by task/domain/approach. §3.1 describes the existing data → synthetic supplement → final mix pipeline. Decontamination is mentioned in §B.1."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding statement or acknowledgments section mentions funding sources. All authors are from Salesforce AI Research but no funding disclosure is provided."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All authors are listed as 'Salesforce AI Research' with email addresses. The affiliation is clearly stated on the first page."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "Salesforce funds this research and has a commercial interest in demonstrating strong AI evaluation capabilities. The funder is not independent of the outcome."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is provided. Salesforce employees evaluating Salesforce-trained models is an undeclared conflict."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No training data cutoff date is stated for the base models (Qwen3-8B-Base, gpt-oss-20B) or for the FARE training data collection period."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "§B.1 states 'We took efforts to decontaminate our training sets with N-gram matching approaches, following Guha et al. (2025).' This addresses potential overlap between training and evaluation data."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "While N-gram decontamination is mentioned for FARE's own training data, no analysis is provided for whether the base models (Qwen3, gpt-oss) may have seen the evaluation benchmarks during their pre-training."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference cost, latency, or tokens-per-evaluation is reported despite the paper emphasizing efficiency as a design goal (§2: 'we seek efficiency'). The SC@32 experiments multiply inference cost 32x without cost discussion."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No total compute budget is stated. Training uses 2.5M samples with iterative RS-SFT but GPU hours, training time, and total API spend for synthetic data generation are not reported."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No multi-seed results are reported. All benchmark evaluations appear to be single-run. Training uses a single random seed without sensitivity analysis."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The number of evaluation runs is not stated. For pairwise benchmarks with consistent accuracy, each sample is run twice (swapping order), but there are no repeated runs for variance estimation."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No hyperparameter search budget is reported. The ablation study (Table 6) explores a few configurations but does not report total search budget or how the final hyperparameters were selected."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "Table 6 shows ablation results across configurations and the paper explains selection rationale: 'we choose 40% and 60% to train FARE-8B and FARE-20B' based on peak average performance across pairwise and ProcessBench."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No statistical tests are performed at all, so no multiple comparison correction. The paper compares across many benchmarks and baselines without any correction."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors evaluate their own FARE models against baselines without acknowledging self-comparison bias. Baselines use officially reported numbers or are run with their own templates (§C.1), but the asymmetry in effort is not discussed."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "FARE is trained on 2.5M samples with iterative RS-SFT while baselines may use far less data/compute. No compute-matched comparison is provided. FARE-20B has 3.6B active parameters vs. 70B+ baselines, but training compute is not compared."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "No discussion of whether the 7 benchmarks actually measure meaningful evaluation quality. The paper does not question construct validity of any benchmark."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": false,
    333         "answer": false,
    334         "justification": "No scaffolding is involved in FARE evaluation. Models are prompted directly without agentic frameworks."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No discussion of temporal leakage. The base models (Qwen3, gpt-oss) may have been trained on data containing benchmark problems, and training data includes datasets from various time periods without temporal analysis."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether the evaluation setup leaks information. For example, the reference-based verification task provides a reference answer — no analysis of whether this creates an unfair advantage over baselines that don't use references."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether training and evaluation data share structural similarities. Training data uses some of the same seed datasets (MATH, MBPP) that appear in evaluation benchmarks, and while N-gram decontamination is applied, distributional overlap is not analyzed."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": true,
    356         "justification": "§B.1 states 'We took efforts to decontaminate our training sets with N-gram matching approaches, following Guha et al. (2025).' This is a concrete leakage prevention method."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "FARE-8B outperforms larger specialized RL-trained evaluators on pairwise benchmarks",
    363       "evidence": "Table 1: FARE-8B achieves 55.71 on JudgeBench vs. the same-size J1-8B (42.00) and EvalPlanner-8B (30.20) and the larger RM-R1-14B (46.86). Also outperforms CompassJudger-14B (50.29).",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "FARE-20B surpasses specialized 70B+ evaluators",
    368       "evidence": "Table 1: FARE-20B (64.29 JudgeBench) vs. EvalPlanner-70B (56.60), J1-70B (60.00). Table 2: FARE-20B (84.4 ProcessBench) vs. Qwen2.5-Math-72B-PRM (78.3).",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "FARE-20B achieves near-oracle reranking performance on MATH",
    373       "evidence": "Fig. 3: FARE-20B reranking of Llama-3.1-8B on MATH approaches the oracle green line, with 50.83 vs. 53.47 oracle (Table 10).",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "FARE improves downstream RL-trained model performance by up to 14.1% vs. string-matching verifiers",
    378       "evidence": "Fig. 4: String matching yields 39.6 average, FARE-20B yields 45.2, a relative improvement of (45.2-39.6)/39.6 = 14.1%.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Continually-finetuned FARE-Code outperforms gpt-oss-20B by 65% on test-case quality evaluation",
    383       "evidence": "Fig. 5: FARE-20B-Code achieves 56.55 on test case quality vs. gpt-oss-20B's 34.40, a relative improvement of ~64.4%.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Iterative rejection sampling SFT is a stable approach for training evaluators at scale",
    388       "evidence": "The paper trains on 2.5M samples and shows improving performance across iterations (Fig. 6 shows consistency improving with data scale). Ablation in Table 6 shows stable results across configurations.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Positional robustness emerges with data scale",
    393       "evidence": "Fig. 6 shows consistency steadily increasing from ~65% to ~80% over 2.5M training samples for both Qwen3-8B and Qwen2.5-7B models.",
    394       "supported": "moderate"
    395     }
    396   ],
    397   "methodology_tags": ["benchmark-eval"],
    398   "key_findings": "FARE, a family of 8B and 20B generative evaluators trained on 2.5M multi-task samples via iterative rejection-sampling SFT, achieves best-in-class performance across 7 evaluation benchmarks spanning pairwise, step-level, and reference-based verification tasks. FARE-20B surpasses specialized 70B+ evaluators despite having ~20x fewer active parameters. In downstream applications, FARE achieves near-oracle reranking on MATH, improves RL-trained model performance by 14.1% over string-matching verifiers, and can be continually finetuned for domain-specific evaluation with only 15K samples.",
    399   "red_flags": [
    400     {
    401       "flag": "Company evaluating its own product",
    402       "detail": "All authors are from Salesforce AI Research, and the paper compares Salesforce's own FARE models favorably against competitors. FARE is initialized from third-party base models (Qwen3-8B-Base and gpt-oss-20B, the latter an OpenAI open-weight model), so the gpt-oss-20B/120B baselines that FARE is shown to improve upon are not Salesforce models; the conflict of interest concerns the FARE models themselves."
    403     },
    404     {
    405       "flag": "No variance or error bars across any results",
    406       "detail": "All results across 7 benchmarks and 3 downstream settings are single-point estimates. For a paper making strong comparative claims ('best-in-class', 'sets the new standard'), the absence of any uncertainty quantification is a significant concern."
    407     },
    408     {
    409       "flag": "No limitations section",
    410       "detail": "The paper has no limitations discussion whatsoever. For a paper claiming 'foundational' evaluators, failure to discuss where the approach might fail or what settings are out of scope is a notable omission."
    411     },
    412     {
    413       "flag": "No compute cost reported despite emphasizing efficiency",
    414       "detail": "The paper lists efficiency as a design goal (§2) but never reports training compute, inference latency, or API costs. The 2.5M sample training and 12-generator synthetic data pipeline represent substantial compute, and SC@32 experiments multiply inference cost 32x, all unreported."
    415     },
    416     {
    417       "flag": "Selective benchmark presentation",
    418       "detail": "FARE-20B underperforms gpt-oss-20B on some benchmarks when prompted without critique/CoT (Table 7: PPE drops from 74.4 to 68.9, RM-Bench from 90.5 to 85.5). The main results use the favorable prompting setup."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "Foundational autoraters: Taming large language models for better automatic evaluation",
    424       "authors": ["Tu Vu", "Kalpesh Krishna", "Salaheddin Alzubi"],
    425       "year": 2024,
    426       "arxiv_id": "2407.10817",
    427       "relevance": "Prior work on foundational multi-task evaluator training that FARE builds upon."
    428     },
    429     {
    430       "title": "Self-taught evaluators",
    431       "authors": ["Tianlu Wang", "Ilia Kulikov", "Olga Golovneva"],
    432       "year": 2024,
    433       "arxiv_id": "2408.02666",
    434       "relevance": "Closely related iterative SFT approach for training evaluators, used as baseline."
    435     },
    436     {
    437       "title": "J1: Incentivizing thinking in LLM-as-a-judge via reinforcement learning",
    438       "authors": ["Chenxi Whitehouse", "Tianlu Wang"],
    439       "year": 2025,
    440       "arxiv_id": "2505.10320",
    441       "relevance": "RL-trained judge model used as primary baseline across pairwise benchmarks."
    442     },
    443     {
    444       "title": "RM-R1: Reward modeling as reasoning",
    445       "authors": ["Xiusi Chen", "Gaotang Li"],
    446       "year": 2025,
    447       "arxiv_id": "2505.02387",
    448       "relevance": "RL-trained reward model baseline for pairwise evaluation."
    449     },
    450     {
    451       "title": "ProcessBench: Identifying process errors in mathematical reasoning",
    452       "authors": ["Chujie Zheng", "Zhenru Zhang"],
    453       "year": 2024,
    454       "arxiv_id": "2412.06559",
    455       "relevance": "Key benchmark for step-level evaluation of mathematical reasoning."
    456     },
    457     {
    458       "title": "JudgeBench: A benchmark for evaluating LLM-based judges",
    459       "authors": ["Sijun Tan", "Siyuan Zhuang"],
    460       "year": 2024,
    461       "arxiv_id": "2410.12784",
    462       "relevance": "Primary pairwise benchmark for evaluating LLM judges in reasoning settings."
    463     },
    464     {
    465       "title": "Evaluating judges as evaluators: The JETTS benchmark",
    466       "authors": ["Yilun Zhou", "Austin Xu"],
    467       "year": 2025,
    468       "arxiv_id": "2504.15253",
    469       "relevance": "Benchmark for evaluating LLM judges as test-time scaling evaluators, used for downstream evaluation."
    470     },
    471     {
    472       "title": "Direct judgement preference optimization",
    473       "authors": ["Peifeng Wang", "Austin Xu"],
    474       "year": 2024,
    475       "arxiv_id": "2409.14664",
    476       "relevance": "Training approach for multi-task foundational evaluators at scale."
    477     },
    478     {
    479       "title": "General-reasoner: Advancing LLM reasoning across all domains",
    480       "authors": ["Xueguang Ma", "Qian Liu"],
    481       "year": 2025,
    482       "arxiv_id": "2505.14652",
    483       "relevance": "Provides the GRPO training setup and verifier baselines used in FARE's downstream evaluation."
    484     },
    485     {
    486       "title": "No free labels: Limitations of LLM-as-a-judge without human grounding",
    487       "authors": ["Michael Krumdick", "Charles Lovering"],
    488       "year": 2025,
    489       "arxiv_id": "2503.05061",
    490       "relevance": "Documents risks of LLM evaluators generating reference answers, motivating FARE's design choice to avoid this."
    491     },
    492     {
    493       "title": "Inference-time scaling for generalist reward modeling",
    494       "authors": ["Zijun Liu", "Peiyi Wang"],
    495       "year": 2025,
    496       "arxiv_id": "2504.02495",
    497       "relevance": "DeepSeek-GRM approach for inference-time scaling of evaluators, used as baseline in Table 4."
    498     },
    499     {
    500       "title": "StepWiser: Stepwise generative judges for wiser reasoning",
    501       "authors": ["Wei Xiong", "Wenting Zhao"],
    502       "year": 2025,
    503       "arxiv_id": "2508.19229",
    504       "relevance": "RL-trained specialized step-level evaluator used as baseline on ProcessBench."
    505     }
    506   ]
    507 }

Impressum · Datenschutz