scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32508B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Foundational Automatic Evaluators: Scaling Multi-Task Generative Evaluator Training for Reasoning-Centric Domains",
      6     "authors": [
      7       "Austin Xu",
      8       "Xuan-Phi Nguyen",
      9       "Yilun Zhou",
     10       "Chien-Sheng Wu",
     11       "Caiming Xiong"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2510.17793",
     16     "doi": "10.48550/arXiv.2510.17793"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims are supported: 'FARE-8B challenges larger specialized RL-trained evaluators' (Tables 1-2), 'FARE-20B surpassing specialized 70B+ evaluators' (Tables 1-2), 'near-oracle performance on MATH' (Fig. 3), '14.1% vs string-matching' (Fig. 4), 'outperforms gpt-oss-20B by 65%' (Fig. 5).",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims are mostly justified through controlled ablations (Table 6) that isolate individual factors. The GRPO experiment (Fig. 4) controls for training setup, varying only the verifier. Ablation design uses single-variable manipulation.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title says 'Foundational Automatic Evaluators' and the paper frames FARE as general-purpose, but evaluation is concentrated on math/code/reasoning. The 'multi-domain' claim is not well-bounded — chat/safety evaluation is minimal (Table 8 shows only moderate single-rating performance).",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No substantive discussion of alternative explanations. The gains could be partially due to data contamination, base model selection, or scale effects rather than the RS-SFT recipe, but these are not explored.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper measures benchmark accuracy as a proxy for 'evaluation quality' but does not discuss this gap. Benchmark performance may not reflect real-world evaluation utility. The JETTS reranking experiment partially addresses this but the distinction is not explicitly acknowledged.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations section in the paper. The conclusion (§5) is brief and does not discuss limitations.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats to validity are discussed. The paper does not address potential issues like benchmark overfitting, selection bias in benchmark choice, or generalization concerns.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No explicit scope boundaries are stated. The paper does not specify what settings or domains FARE has NOT been tested on or where it might fail.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding statement or acknowledgments section mentions funding sources. All authors are from Salesforce AI Research but no funding disclosure is provided.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors are listed as 'Salesforce AI Research' with email addresses. The affiliation is clearly stated on the first page.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Salesforce funds this research and has a commercial interest in demonstrating strong AI evaluation capabilities. The funder is not independent of the outcome.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is provided. Salesforce employees evaluating Salesforce-trained models is an undeclared conflict.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 2 provides a formal mathematical definition of 'automatic evaluator' (πθ: X→Y) and precisely defines all five evaluation tasks (pairwise, step-level, reference-based verification, reference-free verification, single rating) with input/output structure.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three contributions are explicitly bullet-pointed in Section 1: (1) a 2.5M multi-task training dataset, (2) iterative RS-SFT recipe, and (3) the FARE-8B and FARE-20B model family.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 systematically positions FARE against prompted evaluators, SFT/DPO-based foundational evaluators, RL-trained models, and inference-time scaling approaches, explaining how RS-SFT bridges these paradigms.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No repository URL or code release link is provided in the paper. The paper mentions using a 'modified version of the OpenRLHF framework' but does not release their modifications or training code.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The paper describes curating 2.5M training samples from various sources but does not release the curated dataset. Individual source datasets are public, but the assembled training mix is not released.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No environment specifications, requirements files, or dependency lists are provided. The paper mentions using OpenRLHF but does not specify versions or dependencies.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided. While training hyperparameters are listed in §B.2, there are no runnable scripts or commands.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results tables (Tables 1-4, 7-10) report only point estimates with no confidence intervals, error bars, or ± notation.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper makes many claims of the form 'FARE-8B outperforms X by Y points' (e.g., §4.1) based solely on comparing numbers without any statistical significance tests.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "The paper consistently reports absolute point differences with baselines, e.g., 'outperforming RM-R1-14B by 6.57 absolute points on JudgeBench' (§4.1), 'a nearly 11 point absolute gain' (§4.2), providing baseline context.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No justification is given for the choice of 2.5M training samples, the number of benchmarks, or benchmark sizes. No power analysis is discussed.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or spread measures are reported for any experiments. Results appear to be single-run numbers. The K=4 rollout sampling is part of training, not evaluation variance.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Extensive baselines are included across all benchmarks: Tables 1-3 compare against 10+ baselines including RISE-Judge, EvalPlanner, J1, RM-R1, CompassJudger, Atla Selene, and frontier models.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines are contemporary, including recently released models like J1, RM-R1, StepWiser, GPT-5, and gpt-oss-120B (all 2024-2025 releases).",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "§D.1 (Table 6) ablates three components: proportion of direct judgment data (30-70%), continuous curriculum, and CoT handling for gpt-oss. §D.2 ablates critique vs. direct judgment prompting.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Multiple metrics are used: consistent accuracy for pairwise, F1 for ProcessBench, accuracy for VerifyBench, Pearson correlation for single-rating, and downstream task performance for JETTS/GRPO experiments.",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "No human evaluation of FARE's outputs is included. All evaluation is automated via benchmark scoring. For an evaluator paper, human assessment of evaluation quality would be relevant.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "All evaluation benchmarks (JudgeBench, ProcessBench, VerifyBench, etc.) are external held-out test sets not used during training. The paper mentions N-gram decontamination (§B.1).",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Per-category breakdowns are provided throughout: ProcessBench shows GSM8K/MATH/OlympiadBench/OmniMATH splits (Table 2), JETTS shows per-benchmark results (Table 10), GRPO shows per-benchmark downstream results (Fig. 4).",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "§D.2 discusses failure cases where direct judgment prompting degrades FARE-20B performance. Table 4 notes MBPP+ degradation with SC@32. Table 10 shows cases where FARE-8B fails to improve over baseline.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Several negative results: FARE-20B performance degrades without critique/CoT (Table 7), SC@32 hurts MBPP+ (Table 4), FARE-8B struggles with larger generators on hard benchmarks (§D.6), and Qwen3-8B post-trained is noted as 'over-trained' and difficult to finetune.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Base models are identified as 'Qwen3-8B-Base' and 'gpt-oss-20B' but without version/snapshot dates. Baseline models like 'GPT-4o', 'GPT-5' are used without specific API versions. Generator models for synthetic data list model families but not exact versions.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Full evaluation prompts for pairwise, step-level, reference-based verification, and direct judgment are provided in §E.1 with complete text including system and user prompt templates.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "§B.2 reports batch size 128, learning rate 1e-6, rollout batch sizes 50K/250K, K=4 rollouts at temperature 0.9. §C.2 reports GRPO training hyperparameters (rollout batch 1024, group size 8, temp 1.0, KL 0.001, LR 5e-7).",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used. FARE models are standard generative evaluators without tool use, memory, or multi-step reasoning scaffolds.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "§3.1 describes data curation in detail: existing data sources with hand-crafted rubrics, synthetic data via programmatic error injection and generate-then-grade. Table 5 enumerates all data sources. §B.1 mentions N-gram decontamination.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Raw training data, model outputs, and evaluation predictions are not released. Only aggregate benchmark numbers are reported.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "§3.1 describes data collection in detail: existing data sources are enumerated in Table 5, synthetic data generation uses 12 generators with specific sampling strategies, and rubric creation process is described with an example in §E.2.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. Data comes from existing datasets and synthetic generation.",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The data pipeline is documented: Table 5 lists all sources, Fig. 2 shows breakdowns by task/domain/approach. §3.1 describes the existing data → synthetic supplement → final mix pipeline. Decontamination is mentioned in §B.1.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoff date is stated for the base models (Qwen3-8B-Base, gpt-oss-20B) or for the FARE training data collection period.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "§B.1 states 'We took efforts to decontaminate our training sets with N-gram matching approaches, following Guha et al. (2025).' This addresses potential overlap between training and evaluation data.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "While N-gram decontamination is mentioned for FARE's own training data, no analysis is provided for whether the base models (Qwen3, gpt-oss) may have seen the evaluation benchmarks during their pre-training.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference cost, latency, or tokens-per-evaluation is reported despite the paper emphasizing efficiency as a design goal (§2: 'we seek efficiency'). The SC@32 experiments multiply inference cost 32x without cost discussion.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No total compute budget is stated. Training uses 2.5M samples with iterative RS-SFT but GPU hours, training time, and total API spend for synthetic data generation are not reported.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "No multi-seed results are reported. All benchmark evaluations appear to be single-run. Training uses a single random seed without sensitivity analysis.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The number of evaluation runs is not stated. For pairwise benchmarks scored with consistent accuracy, each sample is evaluated twice (with the response order swapped), but no repeated runs are reported for variance estimation.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "No hyperparameter search budget is reported. The ablation study (Table 6) explores a few configurations but does not report total search budget or how the final hyperparameters were selected.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": true,
    392           "justification": "Table 6 shows ablation results across configurations and the paper explains selection rationale: 'we choose 40% and 60% to train FARE-8B and FARE-20B' based on peak average performance across pairwise and ProcessBench.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied. The paper compares across many benchmarks and baselines without any correction.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The authors evaluate their own FARE models against baselines without acknowledging self-comparison bias. Baselines use officially reported numbers or are run with their own templates (§C.1), but the asymmetry in effort is not discussed.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "FARE is trained on 2.5M samples with iterative RS-SFT while baselines may use far less data/compute. No compute-matched comparison is provided. FARE-20B has 3.6B active parameters vs. 70B+ baselines, but training compute is not compared.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "No discussion of whether the 7 benchmarks actually measure meaningful evaluation quality. The paper does not question construct validity of any benchmark.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": false,
    421           "answer": false,
    422           "justification": "No scaffolding is involved in FARE evaluation. Models are prompted directly without agentic frameworks.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "No discussion of temporal leakage. The base models (Qwen3, gpt-oss) may have been trained on data containing benchmark problems, and training data includes datasets from various time periods without temporal analysis.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "No discussion of whether the evaluation setup leaks information. For example, the reference-based verification task provides a reference answer — no analysis of whether this creates an unfair advantage over baselines that don't use references.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "No discussion of whether training and evaluation data share structural similarities. Training data uses some of the same seed datasets (MATH, MBPP) that appear in evaluation benchmarks, and while N-gram decontamination is applied, distributional overlap is not analyzed.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": true,
    448           "justification": "§B.1 states 'We took efforts to decontaminate our training sets with N-gram matching approaches, following Guha et al. (2025).' This is a concrete leakage prevention method.",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "FARE-20B sets a new standard for open-source evaluators, surpassing specialized 70B+ models with 3.5x fewer total parameters",
    457       "evidence": "Tables 1–3 show FARE-20B outperforming EvalPlanner-70B (64.29 vs 56.60 on JudgeBench), J1-70B on most pairwise benchmarks, and Qwen2.5-Math-72B-PRM on ProcessBench (84.4 vs 78.3)",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "FARE-20B achieves near-oracle performance on MATH as an inference-time best-of-10 reranker",
    462       "evidence": "Figure 3 shows FARE-20B approaching the oracle performance line for MATH across multiple generators, beating SFR-Judge-70B and Skywork-Critic-70B by 14 and 21 absolute points respectively",
    463       "supported": "strong"
    464     },
    465     {
    466       "claim": "Using FARE-20B as a verifier in GRPO training improves downstream model performance by 14.1% relative to string-matching verifiers",
    467       "evidence": "Figure 4 shows overall accuracy improving from 34.3 (string matching) to 45.2 (FARE-20B) for Qwen2.5-7B-Base; this is a single experimental run without variance reporting",
    468       "supported": "moderate"
    469     },
    470     {
    471       "claim": "Data scaling with simple iterative RS-SFT produces evaluators competitive with RL-trained models",
    472       "evidence": "FARE-8B (SFT-trained) outperforms RL-trained RM-R1-14B on JudgeBench (55.71 vs 46.86); ablations in Table 6 validate individual training recipe components",
    473       "supported": "strong"
    474     },
    475     {
    476       "claim": "Continual finetuning of FARE-20B on only 15K code samples yields FARE-20B-Code that outperforms gpt-oss-120B on test-case quality evaluation",
    477       "evidence": "Figure 5 shows FARE-20B-Code scoring 56.55 vs gpt-oss-120B at 32.02 on test-case quality; a 76% relative improvement (24.53 absolute points) from a small domain-specific fine-tuning set",
    478       "supported": "moderate"
    479     },
    480     {
    481       "claim": "Positional bias robustness emerges as a function of training data scale rather than requiring explicit algorithmic mitigation",
    482       "evidence": "Figure 6 shows pairwise consistency scores monotonically increasing with number of training samples for both FARE-8B and an earlier Qwen2.5-7B run",
    483       "supported": "moderate"
    484     }
    485   ],
    486   "methodology_tags": [
    487     "benchmark-eval"
    488   ],
    489   "key_findings": "FARE-20B, a 20B-parameter (3.6B active) evaluator trained with iterative rejection-sampling SFT on 2.5M multi-task samples, achieves state-of-the-art performance among open-source evaluators, surpassing specialized 70B+ models on seven benchmarks. The central finding is that large-scale data curation with a simple semi-online training approach (RS-SFT) is competitive with RL-based evaluator training methods, which typically train smaller models on narrow tasks at higher computational cost. FARE shows strong downstream utility as an inference-time reranker (near-oracle on MATH best-of-10), as a GRPO training verifier (14.1% relative improvement over string-matching), and as an initialization for domain-specific finetuning (surpassing gpt-oss-120B on code test-case quality after only 15K samples). A secondary finding is that positional bias robustness emerges with data scale rather than requiring explicit data augmentation or algorithmic interventions.",
    490   "red_flags": [
    491     {
    492       "flag": "No statistical significance testing",
    493       "detail": "All benchmark comparisons are single-run point estimates with no variance, confidence intervals, or significance tests; small performance gaps between FARE and baselines may not be meaningful."
    494     },
    495     {
    496       "flag": "Training data not released",
    497       "detail": "The core contribution — 2.5M curated training samples — is not released, making exact reproduction infeasible; only the individual source datasets and the evaluation benchmarks are public."
    498     },
    499     {
    500       "flag": "Self-evaluation by employer",
    501       "detail": "All authors are Salesforce employees evaluating their own models against commercial competitors; no independent third-party reproduction or validation is provided."
    502     },
    503     {
    504       "flag": "Base model contamination unaddressed",
    505       "detail": "While FARE training data is N-gram decontaminated, the paper does not address whether Qwen3-8B-Base or gpt-oss-20B pretraining/post-training data overlaps with the seven evaluation benchmarks."
    506     },
    507     {
    508       "flag": "No limitations section",
    509       "detail": "The paper has no dedicated limitations or threats-to-validity section; failure modes, generalization limits, and alternative explanations are not systematically discussed."
    510     },
    511     {
    512       "flag": "Vague decontamination procedure",
    513       "detail": "N-gram decontamination is described only as 'N-gram matching approaches' without specifying n-gram size, similarity threshold, or the number/fraction of samples removed."
    514     }
    515   ],
    516   "cited_papers": [
    517     {
    518       "title": "Foundational Autoraters: Taming Large Language Models for Better Automatic Evaluation",
    519       "relevance": "Direct predecessor establishing the foundational evaluator paradigm at scale; FARE explicitly extends and surpasses this work"
    520     },
    521     {
    522       "title": "Direct Judgement Preference Optimization (SFR-Judge)",
    523       "relevance": "Key foundational evaluator baseline used in all main benchmark tables and JETTS downstream evaluation"
    524     },
    525     {
    526       "title": "Self-Taught Evaluators",
    527       "relevance": "Closely related iterative SFT training approach; FARE directly addresses its limitations (small data scale, pairwise-only tasks)"
    528     },
    529     {
    530       "title": "J1: Incentivizing Thinking in LLM-as-a-Judge via Reinforcement Learning",
    531       "relevance": "Primary RL-trained evaluator baseline that FARE challenges with simpler SFT; directly compared on all pairwise benchmarks"
    532     },
    533     {
    534       "title": "CompassJudger-1: All-in-One Judge Model Helps Model Evaluation and Evolution",
    535       "relevance": "Key foundational evaluator baseline at multiple scales (7B, 14B, 32B) for pairwise and JETTS comparisons"
    536     },
    537     {
    538       "title": "ProcessBench: Identifying Process Errors in Mathematical Reasoning",
    539       "relevance": "Core step-level evaluation benchmark used as one of seven main evaluation benchmarks"
    540     },
    541     {
    542       "title": "JudgeBench: A Benchmark for Evaluating LLM-Based Judges",
    543       "relevance": "Primary pairwise reasoning evaluation benchmark; central to FARE's state-of-the-art claims"
    544     },
    545     {
    546       "title": "Evaluating Judges as Evaluators: The JETTS Benchmark",
    547       "relevance": "Standardized framework for inference-time reranking evaluation used in Section 4.2 downstream experiments"
    548     },
    549     {
    550       "title": "General-Reasoner: Advancing LLM Reasoning Across All Domains",
    551       "relevance": "GRPO training baseline; provides WebInstruct-Verified dataset and string-matching/General-Verifier baselines that FARE improves upon"
    552     }
    553   ],
    554   "engagement_factors": {
    555     "practical_relevance": {
    556       "score": 3,
    557       "justification": "FARE models are directly usable as drop-in components for RL training verifiers, inference-time rerankers, and domain-specific code evaluators — all high-value practitioner pipeline needs."
    558     },
    559     "surprise_contrarian": {
    560       "score": 2,
    561       "justification": "Challenges the dominant trend toward RL-based evaluator training by showing that large-scale SFT with rejection sampling matches or beats expensive RL methods."
    562     },
    563     "fear_safety": {
    564       "score": 0,
    565       "justification": "No safety or risk concerns raised; this is a methodology paper about training evaluation models."
    566     },
    567     "drama_conflict": {
    568       "score": 1,
    569       "justification": "Mild competitive positioning against RL-based approaches ('computationally demanding with brittle training pipelines') but framed constructively."
    570     },
    571     "demo_ability": {
    572       "score": 2,
    573       "justification": "If models are released (implied but not confirmed), practitioners could immediately deploy them; all evaluation prompts are provided in full in Section E."
    574     },
    575     "brand_recognition": {
    576       "score": 2,
    577       "justification": "Salesforce AI Research is a recognized lab; comparisons against GPT-5 and gpt-oss-120B as reference points add credibility and name recognition."
    578     }
    579   },
    580   "hn_data": {
    581     "threads": [
    582       {
    583         "hn_id": "45657595",
    584         "title": "Binary Retrieval-Augmented Reward Mitigates Hallucinations",
    585         "points": 44,
    586         "comments": 3,
    587         "url": "https://news.ycombinator.com/item?id=45657595",
    588         "created_at": "2025-10-21T16:14:28Z"
    589       },
    590       {
    591         "hn_id": "42984225",
    592         "title": "Leveraging Multimodal LLM for Inspirational User Interface Search",
    593         "points": 2,
    594         "comments": 0,
    595         "url": "https://news.ycombinator.com/item?id=42984225",
    596         "created_at": "2025-02-08T16:52:28Z"
    597       },
    598       {
    599         "hn_id": "45876369",
    600         "title": "Diagnosing Representation Dynamics in NER Model Extension",
    601         "points": 1,
    602         "comments": 0,
    603         "url": "https://news.ycombinator.com/item?id=45876369",
    604         "created_at": "2025-11-10T14:30:09Z"
    605       }
    606     ],
    607     "top_points": 44,
    608     "total_points": 47,
    609     "total_comments": 3
    610   }
    611 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs