scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (21910B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "Evaluating Judges as Evaluators: The JETTS Benchmark of LLM-as-Judges as Test-Time Scaling Evaluators",
      6     "authors": [
      7       "Yilun Zhou",
      8       "Austin Xu",
      9       "Peifeng Wang",
     10       "Caiming Xiong",
     11       "Shafiq Joty"
     12     ],
     13     "year": 2025,
     14     "venue": "International Conference on Machine Learning",
     15     "arxiv_id": "2504.15253",
     16     "doi": "10.48550/arXiv.2504.15253"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All three abstract claims (judges competitive with ORMs in reranking, worse than PRMs in beam search, critiques ineffective) are directly supported by experimental results in Sections 4.2–4.4 and the leaderboard in Figure 1.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper claims judge-specific finetuning 'seems to primarily boost instruction-following abilities, sometimes at the cost of other capabilities' (Sec 4.2) based on observational regression, not controlled training experiments. The study design is evaluative, not causal.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Claims are scoped to 'current crop of judge models' and three tested domains. The conclusion frames findings as limitations of current judges, not universal statements about all possible LLM-judges.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not consider alternative explanations for its major findings. For example, poor critique performance could stem from the generator's inability to act on critiques rather than critique quality itself — this distinction is not explored.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Normalized helpfulness is defined as a function of actual task performance (accuracy, pass@1, win rate). Measurement granularity directly matches the claims made.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations or threats-to-validity section. Section 5 is 'Conclusion and Future Work' and the Impact Statement is a single generic sentence claiming no societal consequences need highlighting.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The one specific threat identified (oracle over-estimation in beam search due to answer vs. solution accuracy) appears only in Appendix B.2 rather than as part of a systematic validity analysis. No threats regarding benchmark dataset selection or judge sampling are discussed.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what results do not generalize to. Findings are presented without explicit scope boundaries such as model size ranges or task types where conclusions would not hold.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment section is present. All authors are from Salesforce AI Research but no independent funding source is mentioned.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All five authors are clearly identified as being from Salesforce AI Research in the paper header and correspondence information.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "All authors are Salesforce employees and SFR-Judge (a Salesforce-developed model family with 8B/12B/70B variants) is one of the primary models benchmarked. The authors evaluate their own product without disclosing this conflict.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement is present. The Impact Statement does not address financial interests, patents, equity, or consulting relationships.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "LLM-judges are defined as 'models trained to generate evaluations and critiques in natural language'; test-time scaling is defined; normalized helpfulness is formally defined with equations; all three benchmark tasks are precisely specified.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly claims to propose 'the first systematic benchmark of LLM-judges for model's test-time scaling' in the introduction, with clear description of what JETTS adds over existing judge benchmarks.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 systematically compares JETTS against RewardBench, ProcessBench, PPE, JudgeBench, and critique evaluation benchmarks, explaining how each differs and what gap JETTS fills.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper argues that RewardBench uses responses from different generators allowing judges to exploit stylistic differences, while JETTS uses responses from the same generator. Figure 2 empirically demonstrates that RewardBench performance diverges from JETTS for smaller models, providing evidence that JETTS measures a more fundamental capability.",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "No difficulty distribution analysis is provided. Datasets range from easy (GSM8k) to hard (MATH Level 5) but there is no systematic characterization of item difficulty distribution within the benchmark.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No explicit ceiling or floor effect analysis is conducted. The normalized helpfulness metric partially compensates, but the paper does not analyze whether certain datasets fail to discriminate between judges.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No human baseline is included. Comparisons are made against greedy decoding, random selection, oracle (best-of-N), majority vote, and reward models only.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "The normalized helpfulness metric (h = (p_judge - p_greedy) / (p_oracle - p_greedy)) is formally defined with explicit justification for each component. The effective improvement ratio for refinement is similarly justified: it is designed to require that refinement beat both the greedy and reranking baselines simultaneously.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The benchmark uses pre-existing datasets (GSM8k, MATH, HumanEval+, etc.) with no contamination resistance built in. No canary strings, temporal splits, or dynamic generation mechanisms are employed.",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "The paper does not discuss whether JETTS will remain discriminative as models improve, whether the chosen judge and generator models will become obsolete, or any plans for benchmark updates.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "The paper identifies that oracle accuracy in beam search is a severe over-estimate due to final-answer vs. solution-validity divergence (App B.2), and discusses how GPT-4o stochasticity in CHAMP evaluation is mitigated by pre-computing all response evaluations.",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Full code is released at https://github.com/SalesforceAIResearch/jetts-benchmark, all prompts are provided in the appendix (Figs 15-19), pre-computed model responses are released, and evaluation protocols are described in detail in Appendix A.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Table 1 documents all 8 datasets with sizes and evaluation metrics. Appendix A.1 provides detailed evaluation procedures for each dataset. The paper relies on established datasets with existing documentation and thoroughly describes all benchmark-specific components.",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "A GitHub link is provided but no explicit license for JETTS is stated in the paper. The licensing terms under which others can use and extend the benchmark are not discussed.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "While a 'Practitioner note' recommends using reranking as a proxy for beam search, there is no explicit statement of what should NOT be concluded from JETTS results or what use cases the benchmark is not designed for.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "LLM-judges are competitive with outcome reward models in response reranking",
    203       "evidence": "Figure 1 leaderboard shows top judges (SFR 70B: 0.171, SC 70B: 0.177) comparable to Best RM (0.113) in normalized helpfulness",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "LLM-judges consistently underperform process reward models in beam search",
    208       "evidence": "Figure 9 shows QPRM 7B achieves higher normalized helpfulness than all LLM-judges in beam search for math, including 70B judges",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "Natural language critiques from LLM-judges are ineffective for guiding generator refinement",
    213       "evidence": "Figure 11 shows all 6 evaluated judges achieve effective improvement ratio below 1.0 across all task categories, indicating critique-based refinement underperforms both greedy and reranking baselines",
    214       "supported": "strong"
    215     },
    216     {
    217       "claim": "Small judges cannot provide weak-to-strong guidance for large generators",
    218       "evidence": "Figure 5 regression shows negative normalized helpfulness at judge/generator size ratio ~0.1 for math tasks; 8B judge for 70B generator yields negative helpfulness on average",
    219       "supported": "strong"
    220     },
    221     {
    222       "claim": "RewardBench performance does not predict judge utility in test-time scaling settings",
    223       "evidence": "Figure 2 shows small judges perform comparably to large judges on RewardBench but lag significantly on JETTS reranking and beam search, particularly the 8B vs 70B Skywork-Critic comparison",
    224       "supported": "strong"
    225     },
    226     {
    227       "claim": "Judge effectiveness is domain-dependent: instruction following best, code generation worst",
    228       "evidence": "Figure 4 shows all judges demonstrate highest helpfulness for instruction following, mixed but mostly positive for math, and mostly negative for code generation — consistent across all judge models",
    229       "supported": "strong"
    230     },
    231     {
    232       "claim": "Domain-specific prompting does not improve judge performance",
    233       "evidence": "Figure 23 shows domain-specific prompts for SFR-Judge-8B decrease performance on all benchmarks, though not statistically significantly (all p > 0.05)",
    234       "supported": "moderate"
    235     }
    236   ],
    237   "methodology_tags": [
    238     "benchmark-eval"
    239   ],
    240   "key_findings": "JETTS reveals that LLM-judges are competitive with outcome reward models for response reranking but are consistently inferior to process reward models in beam search, even when judges are much larger than PRMs. Natural language critiques — a purported key advantage of judges — are currently ineffective at guiding generators toward better responses, with all evaluated judges achieving sub-baseline effective improvement ratios. Judge effectiveness is highly domain-dependent: instruction following works best, math is mixed, and no evaluated judge reliably improves code generation. Existing judge benchmarks like RewardBench fail to predict real-world test-time scaling utility, particularly for distinguishing small from large judges.",
    241   "red_flags": [
    242     {
    243       "flag": "Self-serving benchmark design",
    244       "detail": "All authors are Salesforce employees and SFR-Judge (a Salesforce-developed judge family with 8B/12B/70B variants) is one of the primary evaluated models. No conflict-of-interest disclosure is made despite this obvious conflict."
    245     },
    246     {
    247       "flag": "Limited critique evaluation scope",
    248       "detail": "Only 3 of 10 evaluated judges support critique generation, significantly limiting generalizability of the refinement findings. The conclusion that 'all are incapable at this task' rests on a small, potentially unrepresentative sample."
    249     },
    250     {
    251       "flag": "No dedicated limitations section",
    252       "detail": "The paper has no limitations or threats-to-validity section. The Impact Statement is a single generic sentence claiming no societal consequences need highlighting, inadequate for an ICML paper evaluating AI systems."
    253     },
    254     {
    255       "flag": "Oracle over-estimation buried in appendix",
    256       "detail": "The paper acknowledges in Appendix B.2 that oracle accuracy in beam search is 'likely a (severe) over-estimate' because it is based on final answer correctness, not solution validity. This materially affects normalized helpfulness interpretation but is not discussed in the main text."
    257     },
    258     {
    259       "flag": "No inter-judge statistical comparisons",
    260       "detail": "Statistical significance tests compare individual judges against baseline (0) but no pairwise comparisons between judges are reported. Claims about relative judge rankings lack formal statistical backing."
    261     }
    262   ],
    263   "cited_papers": [
    264     {
    265       "title": "RewardBench: Evaluating Reward Models for Language Modeling",
    266       "relevance": "Primary comparison baseline; JETTS is motivated by showing RewardBench inadequately predicts test-time scaling performance"
    267     },
    268     {
    269       "title": "ProcessBench: Identifying Process Errors in Mathematical Reasoning",
    270       "relevance": "Complementary benchmark for evaluating process reward models; directly relevant to JETTS beam search evaluation"
    271     },
    272     {
    273       "title": "How to Evaluate Reward Models for RLHF (PPE)",
    274       "relevance": "Related work evaluating reward model efficacy in best-of-N settings; contrasted with JETTS's multi-task approach"
    275     },
    276     {
    277       "title": "JudgeBench: A Benchmark for Evaluating LLM-based Judges",
    278       "relevance": "Direct predecessor judge benchmark; JETTS extends beyond fixed pairwise test sets to simulate actual test-time compute scenarios"
    279     },
    280     {
    281       "title": "Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models",
    282       "relevance": "Key evaluated judge model; one of the primary judges across all three JETTS tasks"
    283     },
    284     {
    285       "title": "Scaling LLM Test-Time Compute Optimally Can Be More Effective Than Scaling Model Parameters",
    286       "relevance": "Core motivation paper establishing test-time compute paradigm with scalar reward models that JETTS extends to LLM-judges"
    287     },
    288     {
    289       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    290       "relevance": "Motivates the critique-based refinement task; one of the original papers arguing for natural language feedback loops in agentic settings"
    291     },
    292     {
    293       "title": "Direct Judgement Preference Optimization (SFR-Judge)",
    294       "relevance": "Describes the SFR-Judge models (Salesforce's judges) that are primary benchmarked models in JETTS"
    295     }
    296   ],
    297   "engagement_factors": {
    298     "practical_relevance": {
    299       "score": 3,
    300       "justification": "Directly answers 'should I use LLM judges or reward models for test-time scaling?' with concrete domain-specific guidance and a practitioner note on using reranking as a proxy metric."
    301     },
    302     "surprise_contrarian": {
    303       "score": 2,
    304       "justification": "The finding that natural language critiques — a touted advantage of judges — are currently useless for refinement challenges community assumptions; the weak-to-strong failure is also non-obvious."
    305     },
    306     "fear_safety": {
    307       "score": 0,
    308       "justification": "The paper has no safety or risk implications; it is purely an evaluation methodology paper for LLM judges in test-time compute settings."
    309     },
    310     "drama_conflict": {
    311       "score": 1,
    312       "justification": "Mild conflict: Salesforce employees benchmark their own SFR-Judge against competitors without disclosure; the finding that RewardBench misleads practitioners may generate some community debate."
    313     },
    314     "demo_ability": {
    315       "score": 3,
    316       "justification": "Full code released on GitHub with pre-computed responses, allowing practitioners to immediately evaluate new judge models through the JETTS pipeline."
    317     },
    318     "brand_recognition": {
    319       "score": 2,
    320       "justification": "Salesforce AI Research is a recognized lab, the venue is ICML 2025, and the evaluated models include well-known systems from multiple organizations."
    321     }
    322   },
    323   "hn_data": {
    324     "threads": [
    325       {
    326         "hn_id": "40160728",
    327         "title": "CatLIP: Clip Vision Accuracy with 2.7x Faster Pre-Training on Web-Scale Data",
    328         "points": 48,
    329         "comments": 4,
    330         "url": "https://news.ycombinator.com/item?id=40160728",
    331         "created_at": "2024-04-25T17:46:04Z"
    332       },
    333       {
    334         "hn_id": "43686458",
    335         "title": "NPB-Rust: NAS Parallel Benchmarks in Rust",
    336         "points": 6,
    337         "comments": 1,
    338         "url": "https://news.ycombinator.com/item?id=43686458",
    339         "created_at": "2025-04-14T21:21:43Z"
    340       },
    341       {
    342         "hn_id": "41517885",
    343         "title": "Towards Large Language Models as Copilots for Theorem Proving in Lean",
    344         "points": 3,
    345         "comments": 0,
    346         "url": "https://news.ycombinator.com/item?id=41517885",
    347         "created_at": "2024-09-12T05:34:47Z"
    348       },
    349       {
    350         "hn_id": "40086186",
    351         "title": "Toward Self-Improvement of LLMs via Imagination, Searching, and Criticizing",
    352         "points": 3,
    353         "comments": 0,
    354         "url": "https://news.ycombinator.com/item?id=40086186",
    355         "created_at": "2024-04-19T12:51:23Z"
    356       },
    357       {
    358         "hn_id": "43781749",
    359         "title": "A Comprehensive Benchmark for C-to-Safe-Rust Transpilation",
    360         "points": 2,
    361         "comments": 0,
    362         "url": "https://news.ycombinator.com/item?id=43781749",
    363         "created_at": "2025-04-24T12:08:53Z"
    364       },
    365       {
    366         "hn_id": "44327775",
    367         "title": "Approximating Language Model Training Data from Weights",
    368         "points": 2,
    369         "comments": 0,
    370         "url": "https://news.ycombinator.com/item?id=44327775",
    371         "created_at": "2025-06-20T13:56:11Z"
    372       },
    373       {
    374         "hn_id": "44086818",
    375         "title": "Gen2seg: Generative Models Enable Generalizable Instance Segmentation",
    376         "points": 2,
    377         "comments": 0,
    378         "url": "https://news.ycombinator.com/item?id=44086818",
    379         "created_at": "2025-05-25T10:20:25Z"
    380       },
    381       {
    382         "hn_id": "40139677",
    383         "title": "Toward Self-Improvement of LLMs via Imagination, Searching, and Criticizing",
    384         "points": 2,
    385         "comments": 0,
    386         "url": "https://news.ycombinator.com/item?id=40139677",
    387         "created_at": "2024-04-24T02:10:20Z"
    388       },
    389       {
    390         "hn_id": "40116933",
    391         "title": "Toward Self-Improvement of LLMs via Imagination, Searching, and Criticizing",
    392         "points": 2,
    393         "comments": 0,
    394         "url": "https://news.ycombinator.com/item?id=40116933",
    395         "created_at": "2024-04-22T18:02:16Z"
    396       },
    397       {
    398         "hn_id": "45349444",
    399         "title": "Seeing Is Deceiving:Mirror-Based Lidar Spoofing for Autonomous Vehicle Deception",
    400         "points": 1,
    401         "comments": 0,
    402         "url": "https://news.ycombinator.com/item?id=45349444",
    403         "created_at": "2025-09-23T16:39:48Z"
    404       }
    405     ],
    406     "top_points": 48,
    407     "total_points": 71,
    408     "total_comments": 5
    409   }
    410 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs