scan.json (28511B)
1 { 2 "paper": { 3 "title": "Proof of Time: A Benchmark for Evaluating Scientific Idea Judgments", 4 "authors": ["Bingyang Ye", "Shan Chen", "Jingxuan Tu", "Chen Liu", "Zidi Xiong", "Samuel Schmidgall", "Danielle S. Bitterman"], 5 "year": 2026, 6 "venue": "arXiv", 7 "arxiv_id": "2601.07606", 8 "doi": "10.48550/arXiv.2601.07606" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "PoT introduces a time-partitioned benchmarking framework for evaluating scientific idea judgments across 30K+ instances in four domains (citations, awards, faculty research evolution, SOTA forecasting). Tool-using agents show large gains on evidence-exploration tasks (Faculty) but smaller or inconsistent gains on structured prediction tasks. Test-time compute scaling generally helps but varies by model family, and structured prompting is not a universal improvement. Post-cutoff evaluation materially shifts model rankings compared to pre-cutoff evaluation.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper provides a GitHub link in the abstract: 'Code, data, and evaluation scripts are at https://github.com/shan23chen/proof_of_time'." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The abstract states code, data, and evaluation scripts are released at the GitHub repository. The benchmark instances are described as JSONL files with a unified schema (Appendix A.8)." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No mention of requirements.txt, Dockerfile, conda environment, or specific library versions in the paper. The sandboxed Docker environment is mentioned for running agents but no dependency specifications are provided." 
30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided in the paper. The paper references a GitHub repo but does not include a 'Reproducing Results' section or specific commands." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Results are reported as point estimates (e.g., '35.1%' accuracy). Figure 2B shows error bars described as 'variability across task–model combinations' but the main tables (3, 7) report only point estimates without confidence intervals." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper makes comparative claims (e.g., 'Claude models show the steepest gains') but no statistical significance tests are reported. All comparisons are based on point estimate differences." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Effect sizes are reported as percentage-point differences with baseline context, e.g., Table 3 shows pre-cutoff vs post-cutoff with deltas (+21.6pp, -17.7pp), and Table 7 shows scaling gains (Δ 15→50). These provide magnitude in context." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "Table 6 reports sample sizes per task (e.g., 200 for citations, 73 for professor field, 9 for field focus) but there is no justification for why these sizes were chosen or whether they provide adequate statistical power." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No standard deviations, confidence intervals, or variance across runs are reported. The paper does not mention running experiments multiple times or reporting spread measures. Single-run results appear to be the norm." 
62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Zero-shot (direct generation) serves as the baseline against agentic configurations. Section 4.2 describes three solver configurations compared systematically." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "The models evaluated include frontier models from 2025-2026 (GPT-5.x, Claude 4.5, Gemini 2.5/3.x) as listed in Table 2. These are contemporary." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper conducts controlled ablations over tool access (zero-shot vs agentic), structured prompting (vanilla vs structured prompt), and message-budget scaling (15/30/50), as described in Sections 4.2-4.3 and Figure 2." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": false, 83 "justification": "The paper reports only exact-match accuracy across all tasks (Section 4.1: 'We report exact-match accuracy for all tasks'). No secondary metrics such as calibration, rank correlation, or partial credit are used." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation of the system outputs is performed. The LLM-as-judge analysis (Section 6) uses Gemini 3 pro, not human judges, to analyze agent traces." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "The benchmark uses a temporal split: pre-cutoff data (2021-2024) as evidence, post-cutoff data (2025) as test. For Awards, 200 pre-cutoff papers are excluded from evidence to avoid leakage (Appendix A.3)." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down by task family (Citations, Awards, Faculty, SOTA) in Figure 2B, by model in Tables 3 and 7, and by model-task combination in Figure 7's heatmap." 
99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 6 provides extensive failure analysis with a taxonomy (Table 4: reasoning errors 37.7%, retrieval/tooling 36.3%, etc.) and incomplete run analysis (Table 5). Table 10 provides bottleneck categories." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper reports that structured prompting sometimes hurts (Section 5.3: GPT models 'often neutral or slightly worse'), that agents provide no benefit on SOTA tasks, and that post-cutoff evaluation can decrease scores (Table 3: Gemini 2.5 Flash -17.7pp)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims about higher interaction budgets improving performance, task-dependent tool use benefits, and 30K+ instances are all supported by Tables 3, 7, Figures 2A-C, and Table 6." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper uses controlled ablations (tool access on/off, prompt variants, budget levels) with all other variables held constant. Section 4.2 describes the three solver configurations designed for controlled comparison. The causal claims about tool use and budget are supported by single-variable manipulations." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper bounds its claims to NLP venues (ACL, NAACL, EMNLP), specific time windows, and acknowledges limitations of proxy targets, temporal choices, and offline sandbox realism in the Limitations section." 
126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "The Limitations section discusses proxy target imperfections (visibility, community dynamics), offline sandbox departure from real-world use, and the possibility that different agent architectures could change results. Section 5.4 notes pre-cutoff results may reflect both exposure and difficulty differences." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper explicitly states 'We call PoT semi-verifiable because the benchmark uses verifiable downstream outcomes as imperfect proxies for idea quality. The signal is verifiable, but the target construct is not directly observable.' (Section 3). The Limitations section further discusses proxy limitations." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Table 2 lists models as 'Claude Opus 4.5', 'GPT-5.2', 'Gemini 3 Pro Preview' etc. without specific API versions, snapshot dates, or model IDs. These are marketing names without version identifiers." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "The structured agent prompt is provided verbatim in Figure 14 (Appendix G). Task instances are described as JSONL with the user-facing prompt included (Appendix A.6-A.8)." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Appendix B states: 'All models were evaluated using their default API sampling parameters (temperature, top_p, etc.) without task-specific tuning.' Infrastructure settings are also specified (max retries: 3, timeout: 10 min for agent, 3 min for zero-shot, concurrent connections: 10)." 
153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "Section 4.5 describes 'All agentic runs use a single-agent ReAct loop in a sandboxed environment.' Section 4.2 details the three solver configurations. The agent has 'access to bash, python, and a text editor.' The structured prompt is in Figure 14." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Appendix A describes data collection in detail: paper metadata from OpenReview (A.1), citation data from Google Scholar with temporal partitioning (A.2), award parsing (A.3), SOTA snapshot construction (A.4), professor publication history assembly with disambiguation heuristics (A.5)." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "A dedicated 'Limitations' section follows the Conclusion, discussing proxy targets, temporal choices, offline sandbox realism, and agent loop constraints." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "The Limitations section addresses specific threats: proxy target imperfections (citations influenced by visibility), the particular cutoff dates chosen affecting difficulty profiles, offline sandbox departing from real-world retrieval, and the single-agent ReAct loop potentially not generalizing to other architectures." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "The paper explicitly states what it does not claim: 'Results should therefore be interpreted as measuring performance under a constrained evidence regime rather than full open-world assistance.' It also limits claims to the specific agent architecture and venues studied." 
180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "The GitHub repository promises code, data, and evaluation scripts. Appendix A.6 describes JSONL instance formats with exact sandbox manifests. The paper states 'We log full interaction traces, tool calls, and termination status' (Section 4.5)." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Appendix A describes data collection in detail: OpenReview for paper metadata (A.1), Google Scholar for citations with specific temporal windows (A.2), award tiers from conference records (A.3), and SOTA from public leaderboards (A.4)." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data sources are standard public repositories (OpenReview, Google Scholar, public leaderboards)." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "Appendix A documents the pipeline: collection at t0, normalization into common schema, construction of sandbox manifests, collection at t1 for labels. Task instance format is documented (A.6-A.8). Table 6 provides final counts per task." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding sources are mentioned in the paper. There is no acknowledgments section listing grants or sponsors." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly listed: Harvard University, Mass General Brigham, Boston Children's Hospital, Brandeis University, Yale University, Johns Hopkins University." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding source is disclosed, so independence cannot be assessed. 
The paper evaluates commercial models (OpenAI, Google, Anthropic) — potential conflicts are not discussed." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": true, 230 "justification": "Appendix A states: 'we define t0 as January 2025 for all models whose knowledge cutoff precedes that date... The only exception is GPT-5.2, whose knowledge cutoff is August 31, 2025, so for GPT-5.2 we set t0 to September 1, 2025.'" 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": true, 235 "justification": "The paper's core design addresses this: Section 3.1 describes time-partitioned evaluation separating pre-cutoff evidence from post-cutoff targets. Section 5.4 (RQ4) explicitly evaluates contamination effects by comparing pre-cutoff vs post-cutoff performance." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": true, 240 "justification": "Contamination resistance is a central design principle. Section 3.2 describes the offline sandbox to 'minimize leakage through opportunistic retrieval.' The temporal partitioning design ensures post-cutoff labels were not available during model training. Section 2.2 discusses this in the context of related work." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 
258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Table 8 reports total API spend per model (e.g., GPT-5.2: $212.10, Claude Opus 4.5: $1781.51). Table 9 reports token usage per run across message budgets. Section 7 discusses cost-performance tradeoffs." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": true, 289 "justification": "Table 8 reports total API spend of $11,364.33 for all experiments. Table 9 provides detailed token usage. Appendix D provides cost-performance analysis." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of running experiments across multiple random seeds. Results appear to be from single runs per configuration." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The paper does not state how many runs produced each result. No mention of 'averaged over K runs' or similar." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search is reported. The paper uses default API parameters (Appendix B), but the structured prompt design and message limits were presumably tuned without reporting a search budget." 
307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "The paper reports results for all configurations (zero-shot, agentic, agentic+structured prompt) at all budget levels (15/30/50) rather than selecting a best configuration. This comprehensive reporting avoids cherry-picking." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied. The paper makes many comparisons across models, tasks, and configurations without any correction." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors designed the benchmark and evaluate models on it without discussing potential bias from benchmark design choices favoring certain model behaviors." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "Section 7 and Figures 11-12 explicitly analyze performance as a function of compute budget (token overhead vs accuracy gain). The message-budget scaling analysis (15/30/50) also addresses this." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": true, 331 "justification": "The paper explicitly discusses construct validity: 'We call PoT semi-verifiable because the benchmark uses verifiable downstream outcomes as imperfect proxies for idea quality' (Section 3). The Limitations section discusses proxy target imperfections." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": true, 335 "answer": true, 336 "justification": "All models use the same agentic environment and tool interfaces: 'For agentic runs, the environment and tool interfaces are held constant across models' (Section 4.4). The scaffold is controlled across model comparisons." 
337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": true, 343 "justification": "Temporal leakage is the core design consideration. Section 3.1 formalizes time-partitioned evaluation. The cutoff t0 is set at or after each model's knowledge cutoff (January 2025 by default), so post-t0 targets cannot have appeared in training data. GPT-5.2 gets a later t0 (September 1, 2025) because its knowledge cutoff is later (Appendix A)." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": true, 348 "justification": "The offline sandbox (Section 3.2) prevents feature leakage by mounting only pre-cutoff artifacts: 'Networking is disabled, and each task instance mounts a manifested set of read-only artifacts.' Post-cutoff labels are not accessible to solvers." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "The paper does not discuss whether pre-cutoff evidence papers and post-cutoff evaluation papers share structural similarities (same labs, same topics, near-duplicate methods) that could inflate performance." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": true, 358 "justification": "The pre-cutoff vs post-cutoff comparison (Section 5.4, Table 3) serves as a contamination detection method, revealing which models may have been exposed to historical targets. The temporal split itself is a leakage prevention method." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Higher interaction budgets generally improve agentic performance across most models.", 365 "evidence": "Table 7 shows 15→50 message limit gains ranging from +12.1pp (GPT-5 nano) to +28.6pp (Gemini 3 Pro).
Figure 2A visualizes these scaling curves.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "The benefit of tool use is strongly task dependent, with Faculty showing the largest improvement and SOTA showing minimal gain.", 370 "evidence": "Figure 2B shows Faculty improving from near-zero (zero-shot) to ~two-thirds (agentic), Citations gaining ~10pp, Awards changing little, and SOTA remaining near ceiling in both settings.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Structured agent prompts have family-dependent and model-dependent effects, not universal improvements.", 375 "evidence": "Figure 2C shows Claude models tend to benefit, GPT models are neutral or slightly worse, and Gemini models show mixed outcomes (Section 5.3).", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Post-cutoff evaluation materially shifts conclusions about model performance compared to pre-cutoff evaluation.", 380 "evidence": "Table 3 shows shifts ranging from -17.7pp (Gemini 2.5 Flash) to +25.1pp (GPT-5.2) on Awards when moving from pre-cutoff to post-cutoff evaluation.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Reasoning errors and retrieval/tooling failures are the dominant failure modes in agentic runs.", 385 "evidence": "Table 4 shows reasoning errors at 37.7% and retrieval/tooling at 36.3% of complete-wrong runs. Table 5 shows looping/thrashing at 36.5% of incomplete runs (Section 6).", 386 "supported": "strong" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "No variance or uncertainty quantification", 392 "detail": "All results are single-run point estimates with no standard deviations, confidence intervals, or multiple-seed analysis. Given the stochastic nature of LLM outputs, single runs may not be reliable. The paper's own findings about 'lucky correctness' (Table 11: 43.1%) suggest results could be noisy." 393 }, 394 { 395 "flag": "Very small sample sizes for some tasks", 396 "detail": "Field Focus has only 9 instances (Table 6). 
SOTA Bucket has 45 instances. Professor Field has 73. These small sizes make percentage-point comparisons unreliable without statistical tests." 397 }, 398 { 399 "flag": "LLM-as-judge for failure analysis without validation", 400 "detail": "The failure taxonomy (Section 6) relies entirely on an LLM judge (Gemini 3 pro) without human validation of the judge's accuracy. The paper does not report inter-annotator agreement or judge reliability." 401 }, 402 { 403 "flag": "Single evaluation metric", 404 "detail": "Only exact-match accuracy is reported. For tasks like citation bucket prediction or ranking, partial-credit metrics (e.g., rank correlation, bucket distance) could reveal different patterns but are not used." 405 } 406 ], 407 "cited_papers": [ 408 { 409 "title": "React: Synergizing reasoning and acting in language models", 410 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik Narasimhan", "Yuan Cao"], 411 "year": 2023, 412 "relevance": "Core agentic framework (ReAct loop) used as the agent architecture in PoT's experimental setup." 413 }, 414 { 415 "title": "Toolformer: Language models can teach themselves to use tools", 416 "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì"], 417 "year": 2023, 418 "arxiv_id": "2302.04761", 419 "relevance": "Foundational work on LLM tool use, directly relevant to evaluating tool-using agents." 420 }, 421 { 422 "title": "Are emergent abilities of large language models a mirage?", 423 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 424 "year": 2023, 425 "relevance": "Challenges claims about emergent capabilities in LLMs, relevant to evaluating benchmark design and metric choice effects." 
426 }, 427 { 428 "title": "Large language models cannot self-correct reasoning yet", 429 "authors": ["Jie Huang", "Xinyun Chen", "Swaroop Mishra"], 430 "year": 2023, 431 "arxiv_id": "2310.01798", 432 "relevance": "Relevant to understanding agent failure modes and the limits of self-correction in agentic reasoning loops." 433 }, 434 { 435 "title": "Benchmark data contamination of large language models: A survey", 436 "authors": ["Cheng Xu", "Shuhao Guan", "Derek Greene", "M.-Tahar Kechadi"], 437 "year": 2024, 438 "arxiv_id": "2406.04244", 439 "relevance": "Directly relevant survey on benchmark contamination, which PoT's temporal design aims to address." 440 }, 441 { 442 "title": "Livebench: A challenging, contamination-limited LLM benchmark", 443 "authors": ["Colin White", "Samuel Dooley", "Manley Roberts"], 444 "year": 2025, 445 "arxiv_id": "2406.19314", 446 "relevance": "Live benchmark approach to contamination resistance, parallel design philosophy to PoT's time-partitioned approach." 447 }, 448 { 449 "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code", 450 "authors": ["Naman Jain", "King Han", "Alex Gu"], 451 "year": 2024, 452 "arxiv_id": "2403.07974", 453 "relevance": "Contamination-free code evaluation benchmark using temporal partitioning, relevant to benchmark design methodology." 454 }, 455 { 456 "title": "Large language models are not fair evaluators", 457 "authors": ["Peiyi Wang", "Lei Li", "Liang Chen"], 458 "year": 2024, 459 "relevance": "Documents inconsistencies in LLM-based judging, relevant to evaluating AI systems as scientific idea judges." 460 }, 461 { 462 "title": "Survey on evaluation of LLM-based agents", 463 "authors": ["Asaf Yehudai", "Lilach Eden", "Alan Li"], 464 "year": 2025, 465 "arxiv_id": "2503.16416", 466 "relevance": "Survey of agent evaluation methods including cost-efficiency and robustness, directly relevant to agentic evaluation methodology." 
467 }, 468 { 469 "title": "Towards a science of scaling agent systems", 470 "authors": ["Yubin Kim", "Ken Gu", "Chanwoo Park"], 471 "year": 2025, 472 "arxiv_id": "2512.08296", 473 "relevance": "Framework for understanding when agents help vs. when they don't, directly relevant to PoT's agent evaluation findings." 474 }, 475 { 476 "title": "An information theoretic perspective on agentic system design", 477 "authors": ["Shizhe He", "Avanika Narayan", "Ishan S. Khare"], 478 "year": 2025, 479 "arxiv_id": "2512.21720", 480 "relevance": "Theoretical framework for agent design, relevant to understanding when agentic approaches provide value." 481 }, 482 { 483 "title": "ToolSandbox: A stateful, conversational, interactive evaluation benchmark for LLM tool use capabilities", 484 "authors": ["Jiarui Lu", "Thomas Holleis", "Yizhe Zhang"], 485 "year": 2025, 486 "relevance": "Sandboxed tool-use evaluation benchmark, directly comparable to PoT's offline sandbox approach." 487 } 488 ] 489 }