scan.json (25445B)
1 { 2 "paper": { 3 "title": "ResearchRubrics: A Benchmark of Prompts and Rubrics For Evaluating Deep Research Agents", 4 "authors": ["Manasi Sharma", "Chen Bo Calvin Zhang", "Chaithanya Bandi", "Clinton Wang", "Ankit Aich", "Huy Nghiem", "Tahseen Rabbani", "Ye Htet", "Brian Jang", "Sumana Basu", "Aishwarya Balwani", "Denis Peskoff", "Marcos Ayestaran", "Sean M. Hendryx", "Brad Kenstler", "Bing Liu"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2511.07685", 8 "doi": "10.48550/arXiv.2511.07685" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "RESEARCHRUBRICS introduces 101 prompts with 2,593 expert-written rubric criteria for evaluating Deep Research agents across 9 domains. Even leading agents (Gemini DR, OpenAI DR) achieve under 68% rubric compliance. Implicit reasoning and synthesis account for 45-50% of all failures. LLM-based rubric augmentation catastrophically degrades human-LLM alignment by 15-20%, while adding concrete examples improves it by 3-4%.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The abstract states 'we release RESEARCHRUBRICS (including all prompts, rubrics, and evaluation code)' and provides a URL: https://scale.com/research/researchrubrics." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The benchmark includes all prompts and rubrics as released data. The abstract confirms release of all prompts and rubrics." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No environment specifications, requirements.txt, or dependency information is provided in the paper." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided. The evaluation prompts are shown (Figs. 
19-21) but there are no instructions for replicating the full evaluation pipeline." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Main results in Table 5 report point estimates (e.g., 0.677, 0.664) with no confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims Gemini DR outperforms OpenAI DR and Perplexity DR but provides no statistical significance tests for these comparisons." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports absolute compliance scores and percentage breakdowns (e.g., 67.7% ternary, 61.5% binary) with baseline context, and failure rate breakdowns by category. Correlation coefficients with p-values are reported for length-quality analysis (e.g., r=0.280, p=0.005)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The benchmark has 101 prompts but no justification is given for why this number is sufficient. No power analysis or sample size discussion." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or spread measures are reported for the main compliance scores. Table 9 reports SD for response lengths but not for evaluation scores." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Three commercial DR systems are compared against each other (Gemini DR, OpenAI DR, Perplexity DR), and results are contextualized against other benchmarks (LiveResearchBench, DeepResearch Bench)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "All three evaluated systems (OpenAI DR, Gemini DR, Perplexity DR) are state-of-the-art commercial deep research agents as of 2025." 
74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Section 4.4 presents ablation studies on rubric design: example detail (low vs. high) and LLM augmentation (absent vs. present), shown in Table 7." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Multiple evaluation schemes are used: binary vs. ternary grading, Macro F1 for human-LLM alignment, per-category failure rates, mandatory vs. optional criteria breakdowns." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": true, 88 "justification": "Nine expert annotators provided ground truth across 303 responses for human-LLM judge alignment analysis (Section 4.3, Table 6)." 89 }, 90 "held_out_test_set": { 91 "applies": false, 92 "answer": false, 93 "justification": "This is a benchmark paper evaluating commercial systems, not a training/tuning study. There is no train/test split to hold out." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Extensive breakdowns are provided: by rubric axis (Fig. 5), by domain (Fig. 14), by complexity dimension (Fig. 6), by mandatory vs. optional criteria (Fig. 8)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 4.2 discusses failure patterns in detail: implicit reasoning and synthesis failures (45-50%), mandatory vs. optional criteria failures, breadth-accuracy trade-off in citations." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The LLM augmentation ablation (Table 7) shows a clearly negative result: automated rubric expansion 'catastrophically degrades alignment by 15-20%'." 
109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims ('under 68% average compliance', 'missed implicit context and inadequate reasoning') are supported by Table 5 (67.7% max) and Fig. 5 (implicit criteria dominant failure mode)." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper makes causal claims like 'improving Deep Research agents requires architectural innovation rather than incremental refinement' (Conclusion) and 'current architectures inherently favor creative synthesis over systematic execution' (Section 4.5) without adequate evidence for these causal/architectural claims. These are speculative explanations for correlational observations." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper tests 3 commercial DR systems on 101 prompts but makes broad claims about 'fundamental architectural limitations' and 'current systems' generally. The title and claims extend well beyond the three specific systems tested." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "Section 4.2 discusses the length-quality conflation and considers whether correlation reflects 'genuine informational density rather than stylistic inflation.' Section 4.5 discusses multiple explanations for observed patterns." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper is explicit that rubric compliance is the proxy and discusses its relationship to actual research quality. The distinction between mandatory and optional criteria addresses sufficiency vs. excellence. Section 4.2 discusses the length-quality conflation." 
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "LLM judges are listed as 'GPT-5', 'Claude-Sonnet-4.5', 'Gemini-2.5-Pro' without specific version snapshots or API dates. The evaluated DR systems are named by product (e.g., 'OpenAI Deep Research') without versioning." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Full evaluation prompts are provided in Figures 19-21, including the system prompt, user prompt, example removal prompt, and augmentation prompt." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for either the LLM judges or the DR systems being evaluated." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "The paper evaluates third-party commercial DR tools (OpenAI DR, Gemini DR, Perplexity DR) as black boxes. The authors cannot describe their internal scaffolding." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "The three-stage data collection pipeline is described in detail (Fig. 2, Section 3.1): Expert 1 proposes, Expert 2 reviews and iterates, Expert 3 makes final adjustments. PDF-to-markdown conversion of responses is mentioned in Section 4.1." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no dedicated limitations section. The Conclusion mentions 'Future Work' but does not substantively discuss limitations of the benchmark itself." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed. 
The paper does not address threats like annotator bias, prompt selection bias, or temporal validity of results." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound its claims to the 3 tested systems or discuss populations/settings excluded." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "The paper releases all prompts, rubrics, and evaluation code. The raw human annotation data for the 303 responses appears available through the release." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 3.1 describes the three-expert pipeline, the source of prompt ideas (user forums, Q&A sites, brainstorming), and the iterative refinement process in detail (Fig. 2)." 192 }, 193 "recruitment_methods_described": { 194 "applies": true, 195 "answer": false, 196 "justification": "The paper mentions '9 expert annotators' for human evaluation and 'experts' for data collection but does not describe how these experts were recruited, their qualifications beyond 'strong STEM background', or potential selection biases." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline from prompt creation (3-stage expert process) through evaluation (LLM-as-judge with ternary scoring) to final score calculation (Equation 1) is documented." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding sources are disclosed. Most authors are affiliated with Scale AI (a commercial company) but no funding statement is provided." 
209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly listed: Scale AI (majority of authors), University of Maryland, University of Chicago, Washington University, McGill University, UC Berkeley." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "Scale AI is a data labeling and evaluation company that sells benchmark and evaluation services. The majority of authors are Scale AI employees. Scale AI has a direct commercial interest in demonstrating the need for human-expert evaluation benchmarks, which is exactly what this paper advocates." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interest declarations are present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff dates are stated for any of the LLM judges (GPT-5, Claude-Sonnet-4.5, Gemini-2.5-Pro) or the evaluated DR systems." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether the benchmark prompts or rubrics could appear in LLM training data. Since the rubrics are expert-written and novel, this risk is lower but is not discussed." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "No discussion of whether any benchmark content was available online before the models' training cutoffs. The paper does not address contamination for either the evaluated systems or the LLM judges." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": true, 246 "answer": false, 247 "justification": "The paper includes human annotators (9 experts for 303 responses). No pre-registration is mentioned." 
248 }, 249 "irb_or_ethics_approval": { 250 "applies": true, 251 "answer": false, 252 "justification": "No IRB or ethics board approval is mentioned for the human evaluation study." 253 }, 254 "demographics_reported": { 255 "applies": true, 256 "answer": false, 257 "justification": "The 9 expert annotators are not characterized beyond the generic label 'expert' (described only as having a 'strong STEM background'). No demographics, experience levels, or individual background information are provided." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": true, 261 "answer": false, 262 "justification": "No inclusion/exclusion criteria for annotator selection are described. Experts are defined as an 'individual with a strong STEM background who is skilled in task design and evaluation' but no formal criteria are given." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "This is not an experimental study with treatment conditions requiring randomization. It is a benchmark evaluation study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "Blinding is not clearly applicable to this benchmark evaluation design." 273 }, 274 "attrition_reported": { 275 "applies": true, 276 "answer": false, 277 "justification": "No information is given about whether all 9 annotators completed all assigned tasks or whether there was any dropout." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference costs are reported for running the DR systems or the LLM judges across the 101 prompts." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total computational budget is stated. The paper mentions '2,800+ hours of human labor' for benchmark creation but not compute costs."
290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No seed sensitivity analysis. Results appear to be from single runs of each DR system. LLM judge results also appear unreplicated." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of runs per system is not stated. It appears each DR system was run once per prompt." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": false, 305 "answer": false, 306 "justification": "The paper evaluates commercial systems with no hyperparameter tuning. Not applicable." 307 }, 308 "best_config_selection_justified": { 309 "applies": false, 310 "answer": false, 311 "justification": "No configuration selection is involved — commercial systems are used as-is." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "Multiple comparisons are made across 3 systems, multiple domains, complexity dimensions, and rubric categories with no correction applied." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": false, 320 "answer": false, 321 "justification": "The paper does not propose its own system — it evaluates third-party systems. Self-comparison bias is not applicable." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "No comparison of compute budgets across systems. Perplexity produces much shorter outputs (suggesting less compute) but this is not analyzed as a compute variable." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": true, 331 "justification": "Section 4.4 directly addresses construct validity of rubric-based evaluation through ablation studies on rubric design. The human consistency analysis (Table 6) validates the benchmark against human judgment." 
332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "The paper evaluates bundled commercial DR products as products, not isolating model vs. scaffold contributions. NA since the tools are the thing being tested." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of whether DR systems may have encountered similar prompts or rubric patterns during training." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information. The LLM judges are given full rubrics — no analysis of whether this introduces bias." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of independence between the benchmark and training data of evaluated systems." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No concrete leakage detection or prevention method is employed." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Even leading DR agents achieve under 68% average rubric compliance", 365 "evidence": "Table 5: Gemini DR 67.7% ternary, 61.5% binary; OpenAI DR 66.4% ternary, 59.7% binary; Perplexity DR 56.6% ternary, 48.7% binary", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Implicit reasoning and synthesis jointly account for 45-50% of all failures", 370 "evidence": "Fig. 
5 shows implicit criteria failure rates of ~49% and synthesis ~25-29% across all systems", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Binary grading achieves substantially higher human-LLM agreement than ternary (0.72-0.76 vs 0.53-0.57 Macro F1)", 375 "evidence": "Table 6: binary F1 ranges 0.717-0.760, ternary ranges 0.527-0.567", 376 "supported": "strong" 377 }, 378 { 379 "claim": "LLM-based rubric augmentation catastrophically degrades alignment by 15-20%", 380 "evidence": "Table 7: augmentation drops binary F1 from 0.721-0.760 to 0.508-0.564", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Performance degrades monotonically with increased logical nesting depth", 385 "evidence": "Fig. 6/13: All three systems show decreasing scores from shallow to deep nesting, though the decrease is modest (~3-5pp for most systems)", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Systematic failures indicate fundamental architectural limitations rather than implementation differences", 390 "evidence": "Section 4.5 argues this from consistency of failure patterns, but this is an interpretive claim without direct architectural evidence", 391 "supported": "weak" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "Company evaluating commercial space it serves", 397 "detail": "Scale AI is a data labeling and evaluation company. The paper's core argument — that human-expert evaluation benchmarks are needed and LLM-generated rubrics are insufficient — directly aligns with Scale AI's commercial interests. 11 of 16 authors are Scale AI employees." 398 }, 399 { 400 "flag": "No limitations section", 401 "detail": "The paper has no dedicated limitations section. For a benchmark paper making strong claims about 'fundamental architectural limitations' of DR systems, the absence of self-critical discussion of the benchmark's own limitations is notable." 
402 }, 403 { 404 "flag": "Overclaiming from limited sample", 405 "detail": "Claims about 'fundamental architectural constraints' and 'current systems' are based on testing only 3 commercial DR systems on 101 prompts. The leap from correlational findings to architectural conclusions is not justified." 406 }, 407 { 408 "flag": "No statistical significance tests", 409 "detail": "Performance differences between systems are presented without any statistical testing, yet comparative rankings are prominently discussed." 410 }, 411 { 412 "flag": "Single-run evaluation", 413 "detail": "DR systems appear to have been run only once per prompt. Given the stochastic nature of LLM outputs, single-run results may not be stable. No variance across runs is reported." 414 } 415 ], 416 "cited_papers": [ 417 { 418 "title": "HealthBench: Evaluating Large Language Models Towards Improved Human Health", 419 "authors": ["R. K. Arora", "J. Wei"], 420 "year": 2025, 421 "arxiv_id": "2505.08775", 422 "relevance": "Related rubric-based LLM evaluation benchmark; comparison point for human-LLM judge alignment methodology." 423 }, 424 { 425 "title": "Why Do Multi-Agent LLM Systems Fail?", 426 "authors": ["M. Cemri"], 427 "year": 2025, 428 "arxiv_id": "2503.13657", 429 "relevance": "Taxonomy of multi-agent LLM system failures (MAST); cited for reasoning-action mismatch rates in agent systems." 430 }, 431 { 432 "title": "Limits to Scalable Evaluation at the Frontier: LLM as Judge Won't Beat Twice the Data", 433 "authors": ["F. E. Dorner", "V. Y. Nastl", "M. Hardt"], 434 "year": 2025, 435 "arxiv_id": "2410.13341", 436 "relevance": "Challenges assumptions about LLM-as-judge scalability; raises concerns about circularity in LLM-generated evaluation." 437 }, 438 { 439 "title": "Towards an AI Co-Scientist", 440 "authors": ["J. 
Gottweis"], 441 "year": 2025, 442 "arxiv_id": "2502.18864", 443 "relevance": "AI co-scientist built on multi-agent Gemini 2.0; raises stakes for evaluating AI research capabilities." 444 }, 445 { 446 "title": "GAIA: A Benchmark for General AI Assistants", 447 "authors": ["G. Mialon"], 448 "year": 2023, 449 "arxiv_id": "2311.12983", 450 "relevance": "General AI assistant benchmark; comparison point for DR evaluation methodology." 451 }, 452 { 453 "title": "Humanity's Last Exam", 454 "authors": ["L. Phan"], 455 "year": 2025, 456 "arxiv_id": "2501.14249", 457 "relevance": "Expert-level evaluation benchmark; DR systems scored 26.6% showing capability gaps." 458 }, 459 { 460 "title": "ExpertLongBench: Benchmarking Language Models on Expert-Level Long-Form Generation Tasks with Structured Checklists", 461 "authors": ["J. Ruan"], 462 "year": 2025, 463 "arxiv_id": "2506.01241", 464 "relevance": "Related benchmark using structured checklists for long-form generation evaluation; close comparison point." 465 }, 466 { 467 "title": "Mind2Web 2: Evaluating Agentic Search with Agent-as-a-Judge", 468 "authors": ["B. Gou"], 469 "year": 2025, 470 "relevance": "Agent-as-a-Judge evaluation framework for agentic search; related evaluation methodology." 471 }, 472 { 473 "title": "HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering", 474 "authors": ["Z. Yang"], 475 "year": 2018, 476 "doi": "10.18653/v1/D18-1259", 477 "relevance": "Multi-hop QA benchmark; cited for limitations of multi-hop reasoning in current agent architectures." 478 }, 479 { 480 "title": "BrowseComp: A Simple Yet Challenging Benchmark for Browsing Agents", 481 "authors": ["J. Wei"], 482 "year": 2025, 483 "arxiv_id": "2504.12516", 484 "relevance": "Benchmark for browsing agents; comparison point in the DR benchmark landscape." 485 } 486 ] 487 }