scan-v5.json (27556B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Foundational Automatic Evaluators: Scaling Multi-Task Generative Evaluator Training for Reasoning-Centric Domains", 6 "authors": [ 7 "Austin Xu", 8 "Xuan-Phi Nguyen", 9 "Yilun Zhou", 10 "Chien-Sheng Wu", 11 "Caiming Xiong", 12 "Shafiq Joty" 13 ], 14 "year": 2025, 15 "venue": "arXiv.org", 16 "arxiv_id": "2510.17793", 17 "doi": "10.48550/arXiv.2510.17793" 18 }, 19 "checklist": { 20 "claims_and_evidence": { 21 "abstract_claims_supported": { 22 "applies": true, 23 "answer": true, 24 "justification": "Abstract claims (FARE-8B challenging larger evaluators, FARE-20B surpassing 70B+ models, near-oracle MATH reranking, 14.1% RL training gain, 65% code evaluation improvement) are all backed by Tables 1-3 and Figures 3-5.", 25 "source": "haiku" 26 }, 27 "causal_claims_justified": { 28 "applies": true, 29 "answer": true, 30 "justification": "Causal claims about training components are supported by ablation studies in Table 6, which systematically vary direct judgment data proportion, curriculum learning, and CoT retention strategy.", 31 "source": "haiku" 32 }, 33 "generalization_bounded": { 34 "applies": true, 35 "answer": true, 36 "justification": "The paper explicitly scopes claims to reasoning-centric domains in the title and throughout, and reports per-benchmark performance rather than sweeping generalizations.", 37 "source": "haiku" 38 }, 39 "alternative_explanations_discussed": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper does not discuss alternative explanations for FARE's strong performance — whether results stem primarily from data scale, base model quality, training method, or domain coverage is not systematically disentangled.", 43 "source": "haiku" 44 }, 45 "proxy_outcome_distinction": { 46 "applies": true, 47 "answer": true, 48 "justification": "The paper clearly distinguishes benchmark evaluation (static benchmarks for evaluator quality) from downstream real-world performance (RL training, inference-time reranking), with appropriate metrics for each.", 49 "source": "haiku" 50 } 51 }, 52 "limitations_and_scope": { 53 "limitations_section_present": { 54 "applies": true, 55 "answer": false, 56 "justification": "There is no dedicated limitations section. Brief future work mentions appear in Appendix B.2 but no limitations or threats-to-validity section exists.", 57 "source": "haiku" 58 }, 59 "threats_to_validity_specific": { 60 "applies": true, 61 "answer": false, 62 "justification": "No specific threats to validity are discussed, such as benchmark saturation, base model contamination, or limited evaluator generalization outside tested reasoning domains.", 63 "source": "haiku" 64 }, 65 "scope_boundaries_stated": { 66 "applies": true, 67 "answer": false, 68 "justification": "The paper focuses on reasoning-centric domains but does not explicitly state what its results do NOT show (e.g., no claims about non-English, long-form creative, or multilingual evaluation settings).", 69 "source": "haiku" 70 } 71 }, 72 "conflicts_of_interest": { 73 "funding_disclosed": { 74 "applies": true, 75 "answer": false, 76 "justification": "No funding disclosure is present. All authors are Salesforce AI Research employees but no external funding or grant acknowledgment appears in the paper.", 77 "source": "haiku" 78 }, 79 "affiliations_disclosed": { 80 "applies": true, 81 "answer": true, 82 "justification": "All authors are clearly identified as Salesforce AI Research affiliates on the title page with contact emails.", 83 "source": "haiku" 84 }, 85 "funder_independent_of_outcome": { 86 "applies": true, 87 "answer": false, 88 "justification": "Salesforce employees train and evaluate their own FARE models; there is no independent evaluation by parties without a stake in the outcome.", 89 "source": "haiku" 90 }, 91 "financial_interests_declared": { 92 "applies": true, 93 "answer": false, 94 "justification": "No competing interests statement, patent disclosures, or financial interest declarations appear anywhere in the paper.", 95 "source": "haiku" 96 } 97 }, 98 "scope_and_framing": { 99 "key_terms_defined": { 100 "applies": true, 101 "answer": true, 102 "justification": "The paper formally defines automatic evaluator (AE), input/output spaces, and all five evaluation tasks (pairwise, step-level, reference-based verification, reference-free verification, single rating) with mathematical notation in Section 2.", 103 "source": "haiku" 104 }, 105 "intended_contribution_clear": { 106 "applies": true, 107 "answer": true, 108 "justification": "Three explicit contributions are enumerated in the introduction: multi-task dataset curation, scalable RS-SFT training recipe, and the FARE family of evaluators with rigorous evaluation.", 109 "source": "haiku" 110 }, 111 "engagement_with_prior_work": { 112 "applies": true, 113 "answer": true, 114 "justification": "Section 2 and Appendix A thoroughly situate FARE relative to prompted evaluators, SFT/DPO-trained evaluators, RL-trained evaluators, and earlier foundational evaluators, explaining key differences from STE, EvalPlanner, CompassJudger, and J1.", 115 "source": "haiku" 116 } 117 } 118 }, 119 "type_checklist": { 120 "empirical": { 121 "artifacts": { 122 "code_released": { 123 "applies": true, 124 "answer": false, 125 "justification": "No code release is mentioned anywhere in the paper. The training framework (OpenRLHF, verl) is referenced but no repository link for the FARE training pipeline is provided.", 126 "source": "haiku" 127 }, 128 "data_released": { 129 "applies": true, 130 "answer": false, 131 "justification": "The 2.5M curated training samples are not released. Evaluation uses public benchmarks, but the novel training dataset (including synthetic data and rubrics) is proprietary.", 132 "source": "haiku" 133 }, 134 "environment_specified": { 135 "applies": true, 136 "answer": false, 137 "justification": "While OpenRLHF and verl frameworks are named and hyperparameters listed, no requirements file, Dockerfile, or full dependency specification is provided.", 138 "source": "haiku" 139 }, 140 "reproduction_instructions": { 141 "applies": true, 142 "answer": false, 143 "justification": "Appendix B.2 provides training hyperparameters but without code, training data, or step-by-step instructions, the work cannot be reproduced.", 144 "source": "haiku" 145 } 146 }, 147 "statistical_methodology": { 148 "confidence_intervals_or_error_bars": { 149 "applies": true, 150 "answer": false, 151 "justification": "All results in Tables 1-4 and Figures 3-5 are single point estimates with no confidence intervals or error bars reported.", 152 "source": "haiku" 153 }, 154 "significance_tests": { 155 "applies": true, 156 "answer": false, 157 "justification": "No statistical significance tests are applied to any comparative claims; performance differences are stated as absolute point improvements without any testing of whether they exceed chance.", 158 "source": "haiku" 159 }, 160 "effect_sizes_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Absolute point differences are consistently reported in context (e.g., FARE-8B beats J1-8B by 13.71 points on JudgeBench, 14.1% relative gain over string-matching verifiers), providing effect size context.", 164 "source": "haiku" 165 }, 166 "sample_size_justified": { 167 "applies": true, 168 "answer": false, 169 "justification": "The 2.5M training sample size is motivated by the scaling hypothesis from prior work but no formal sample size justification or power analysis is provided.", 170 "source": "haiku" 171 }, 172 "variance_reported": { 173 "applies": true, 174 "answer": false, 175 "justification": "All benchmark evaluations are single runs with no variance, standard deviation, or inter-run variability reported across any experiment.", 176 "source": "haiku" 177 } 178 }, 179 "evaluation_design": { 180 "baselines_included": { 181 "applies": true, 182 "answer": true, 183 "justification": "Extensive baselines are included: RISE-Judge, EvalPlanner, J1, RM-R1, CompassJudger, Atla Selene, SFR-Judge, Skywork-Critic, StepWiser, and frontier models like GPT-4o, GPT-5, and gpt-oss-120B.", 184 "source": "haiku" 185 }, 186 "baselines_contemporary": { 187 "applies": true, 188 "answer": true, 189 "justification": "Baselines include very recent 2025 RL-trained models (J1, RM-R1, StepWiser) and frontier models (GPT-5, gpt-oss-120B), all contemporary at time of publication.", 190 "source": "haiku" 191 }, 192 "ablation_study": { 193 "applies": true, 194 "answer": true, 195 "justification": "Table 6 ablates proportion of direct judgment data (30-70%), continuous curriculum vs. random shuffling, and CoT retention strategy for the 20B model, quantifying each component's impact on pairwise and step-level benchmarks.", 196 "source": "haiku" 197 }, 198 "multiple_metrics": { 199 "applies": true, 200 "answer": true, 201 "justification": "The paper uses consistent accuracy for pairwise benchmarks, F1 for ProcessBench, Pearson correlation for single-rating tasks, and accuracy for VerifyBench, across 7 core benchmarks and 3 downstream settings.", 202 "source": "haiku" 203 }, 204 "human_evaluation": { 205 "applies": false, 206 "answer": false, 207 "justification": "The paper trains and evaluates automated evaluators on automated benchmarks; human evaluation of FARE outputs is not conducted and is clearly not relevant to this benchmarking paradigm.", 208 "source": "haiku" 209 }, 210 "held_out_test_set": { 211 "applies": true, 212 "answer": true, 213 "justification": "All evaluation is on held-out test benchmarks (JudgeBench, ProcessBench, VerifyBench, etc.) separate from training data, with explicit N-gram decontamination applied.", 214 "source": "haiku" 215 }, 216 "per_category_breakdown": { 217 "applies": true, 218 "answer": true, 219 "justification": "ProcessBench results are broken down by difficulty (GSM8K, MATH, OlympiadBench, OmniMATH); CodingJudgeBench by task type; JETTS provides per-generator and per-benchmark breakdowns in Table 10.", 220 "source": "haiku" 221 }, 222 "failure_cases_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "Section D.6 explicitly notes FARE-8B fails to improve larger generators on harder benchmarks in JETTS; D.2 shows removing CoT from FARE-20B degrades most benchmark scores; Table 4 shows SC hurts MBPP+.", 226 "source": "haiku" 227 }, 228 "negative_results_reported": { 229 "applies": true, 230 "answer": true, 231 "justification": "Negative results include: self-consistency degrades FARE performance on MBPP+, removing CoT from FARE-20B reduces most benchmark scores, and FARE-8B cannot universally improve generator performance in reranking.", 232 "source": "haiku" 233 } 234 }, 235 "setup_transparency": { 236 "model_versions_specified": { 237 "applies": true, 238 "answer": true, 239 "justification": "Base models are specifically identified as Qwen3-8B-Base and gpt-oss-20B with arXiv citations; all 12 generator models for synthetic data are enumerated by name and model family in Appendix B.1.", 240 "source": "haiku" 241 }, 242 "prompts_provided": { 243 "applies": true, 244 "answer": true, 245 "justification": "Appendix E.1 provides full verbatim prompts for pairwise evaluation, direct judgment pairwise, step-level evaluation, and reference-based verification, with all placeholder variables identified.", 246 "source": "haiku" 247 }, 248 "hyperparameters_reported": { 249 "applies": true, 250 "answer": true, 251 "justification": "Training hyperparameters are reported throughout: batch size 128, learning rate 1e-6, rollout batch sizes 50K/250K, K=4 rollout samples at temperature 0.9, and KL coefficient 0.001 for GRPO experiments.", 252 "source": "haiku" 253 }, 254 "scaffolding_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "This paper trains evaluator models without agentic scaffolding; no agentic framework is used in the experimental setup.", 258 "source": "haiku" 259 }, 260 "data_preprocessing_documented": { 261 "applies": true, 262 "answer": true, 263 "justification": "Appendix B.1 describes N-gram decontamination, hand-crafted rubric creation per dataset, programmatic error injection details, and the generate-then-grade procedure with temperature sampling specifics.", 264 "source": "haiku" 265 } 266 }, 267 "data_integrity": { 268 "raw_data_available": { 269 "applies": true, 270 "answer": false, 271 "justification": "The 2.5M training samples are not released; only Table 5 listing source datasets is provided, making independent verification of the curated dataset impossible.", 272 "source": "haiku" 273 }, 274 "data_collection_described": { 275 "applies": true, 276 "answer": true, 277 "justification": "Section 3.1 and Appendix B.1 describe both existing data collection (sources, rubric creation) and synthetic data generation (programmatic error injection and generate-then-grade) in substantial detail with Table 5 enumerating all 24 source datasets.", 278 "source": "haiku" 279 }, 280 "recruitment_methods_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants; all data comes from existing public datasets and automated synthesis pipelines.", 284 "source": "haiku" 285 }, 286 "data_pipeline_documented": { 287 "applies": true, 288 "answer": true, 289 "justification": "The full pipeline from seed datasets through rubric creation, response generation (12 generators), correctness grading, N-gram decontamination, and final dataset composition is documented across Section 3.1 and Appendix B.1.", 290 "source": "haiku" 291 } 292 }, 293 "contamination": { 294 "training_cutoff_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "Training cutoffs for the base models (Qwen3-8B-Base, gpt-oss-20B) are not stated, making it unclear whether benchmark examples were available during base model pre-training.", 298 "source": "haiku" 299 }, 300 "train_test_overlap_discussed": { 301 "applies": true, 302 "answer": true, 303 "justification": "Appendix B.1 explicitly states they applied N-gram matching decontamination following Guha et al. (2025) to remove fine-tuning training samples overlapping with evaluation benchmarks.", 304 "source": "haiku" 305 }, 306 "benchmark_contamination_addressed": { 307 "applies": true, 308 "answer": true, 309 "justification": "The paper explicitly addresses potential benchmark contamination through N-gram matching decontamination of training sets and focuses on modern (2024+) datasets to reduce temporal overlap.", 310 "source": "haiku" 311 } 312 }, 313 "human_studies": { 314 "pre_registered": { 315 "applies": false, 316 "answer": false, 317 "justification": "No human participants; pre-registration is not applicable.", 318 "source": "haiku" 319 }, 320 "irb_or_ethics_approval": { 321 "applies": false, 322 "answer": false, 323 "justification": "No human participants; IRB/ethics approval is not applicable.", 324 "source": "haiku" 325 }, 326 "demographics_reported": { 327 "applies": false, 328 "answer": false, 329 "justification": "No human participants.", 330 "source": "haiku" 331 }, 332 "inclusion_exclusion_criteria": { 333 "applies": false, 334 "answer": false, 335 "justification": "No human participants.", 336 "source": "haiku" 337 }, 338 "randomization_described": { 339 "applies": false, 340 "answer": false, 341 "justification": "No human participants.", 342 "source": "haiku" 343 }, 344 "blinding_described": { 345 "applies": false, 346 "answer": false, 347 "justification": "No human participants.", 348 "source": "haiku" 349 }, 350 "attrition_reported": { 351 "applies": false, 352 "answer": false, 353 "justification": "No human participants.", 354 "source": "haiku" 355 } 356 }, 357 "cost_and_practicality": { 358 "inference_cost_reported": { 359 "applies": true, 360 "answer": false, 361 "justification": "The paper discusses efficiency as a design goal and compares model sizes/active parameters, but reports no specific inference latency, throughput, or cost numbers.", 362 "source": "haiku" 363 }, 364 "compute_budget_stated": { 365 "applies": true, 366 "answer": false, 367 "justification": "Training details (batch size, rollout batch size, steps) are provided but total GPU-hours or compute budget for training FARE-8B or FARE-20B is not disclosed.", 368 "source": "haiku" 369 } 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "FARE-8B outperforms RL-trained evaluators of comparable or larger size on JudgeBench", 376 "evidence": "Table 1 shows FARE-8B scores 55.71 on JudgeBench vs J1-8B (42.00) and RM-R1-14B (46.86), a 13.71 and 8.85 point margin respectively", 377 "supported": "strong" 378 }, 379 { 380 "claim": "FARE-20B sets a new standard for open-source evaluators, surpassing specialized 70B+ models", 381 "evidence": "Table 1 shows FARE-20B (64.29 JudgeBench, 74.4 PPE) outperforming EvalPlanner-70B (56.60, 70.2) and J1-70B (60.00, 72.8) despite 3.5x fewer total and ~20x fewer active parameters", 382 "supported": "strong" 383 }, 384 { 385 "claim": "FARE-20B achieves near-oracle inference-time reranking performance on MATH", 386 "evidence": "Figure 3 shows FARE-20B approaching the oracle green line on MATH across multiple generators, outperforming SFR-Judge-70B by 14 points and Skywork-Critic-70B by 21 points on Llama-3.1-8B generator", 387 "supported": "strong" 388 }, 389 { 390 "claim": "Using FARE-20B as verifiers in GRPO training improves downstream model performance by 14.1% over string-matching verifiers", 391 "evidence": "Figure 4 shows Qwen2.5-7B-Base trained with FARE-20B verifier reaches 45.2 vs 39.6 (string matching); the 14.1% figure is relative improvement, single run without variance", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "Continual finetuning of FARE-20B for code with only 15K samples (FARE-20B-Code) outperforms gpt-oss-120B on average", 396 "evidence": "Figure 5 shows FARE-20B-Code average consistent accuracy exceeds gpt-oss-120B across three CodingJudgeBench tasks, with 10.48 point gain on test-case quality over FARE-20B", 397 "supported": "moderate" 398 }, 399 { 400 "claim": "Large-scale RS-SFT without RL is competitive with RL-trained specialized evaluators", 401 "evidence": "FARE-8B and FARE-20B trained with rejection sampling SFT outperform RL-trained models (J1, RM-R1, StepWiser) on most benchmarks in Tables 1-3", 402 "supported": "strong" 403 }, 404 { 405 "claim": "Positional robustness in pairwise evaluation emerges as a function of training data scale", 406 "evidence": "Figure 6 shows pairwise consistency increasing monotonically from ~65% to ~80% as training samples increase from 0 to 2.5M for both Qwen3 and Qwen2.5 initializations", 407 "supported": "moderate" 408 } 409 ], 410 "methodology_tags": [ 411 "benchmark-eval" 412 ], 413 "key_findings": "FARE demonstrates that scaling training data to 2.5M multi-task, multi-domain samples with iterative rejection sampling SFT achieves state-of-the-art performance for generative evaluators without computationally expensive RL training. FARE-8B at 8B parameters matches or exceeds specialized RL-trained evaluators at 14B+ parameters on reasoning benchmarks, while FARE-20B with 3.6B active parameters outperforms dense 70B+ specialized judges across 7 benchmarks. In downstream applications, FARE-20B achieves near-oracle best-of-10 reranking on MATH and yields 14.1% relative improvement over string-matching verifiers in GRPO RL training. An additional finding is that positional robustness emerges naturally with data scale, suggesting data-driven training can mitigate common evaluator biases without targeted interventions.", 414 "red_flags": [ 415 { 416 "flag": "No statistical testing", 417 "detail": "All comparative claims are made on single-run point estimates without confidence intervals, error bars, or significance tests, making it impossible to determine if performance differences are reliable or within noise." 418 }, 419 { 420 "flag": "No code or training data release", 421 "detail": "Neither the training pipeline code nor the 2.5M curated training samples are released, making reproduction effectively impossible despite the hyperparameter details provided." 422 }, 423 { 424 "flag": "Self-evaluation only", 425 "detail": "All evaluations are conducted by the Salesforce team that developed FARE with no independent evaluation by external parties." 426 }, 427 { 428 "flag": "No compute budget disclosed", 429 "detail": "Total GPU-hours or compute cost for training FARE-8B and FARE-20B is not reported, preventing assessment of practical reproducibility or cost-effectiveness." 430 }, 431 { 432 "flag": "Base model contamination unaddressed", 433 "detail": "Training cutoffs for base models (Qwen3-8B-Base, gpt-oss-20B) are not stated; these models' pretraining data may overlap with evaluation benchmarks in ways the fine-tuning-level N-gram decontamination cannot address." 434 }, 435 { 436 "flag": "No limitations section", 437 "detail": "The paper has no dedicated limitations section; scope boundaries regarding language, domain coverage, model scale, and benchmark generalization are not explicitly stated." 438 } 439 ], 440 "cited_papers": [ 441 { 442 "title": "Foundational Autoraters: Taming Large Language Models for Better Automatic Evaluation", 443 "relevance": "Direct precursor introducing the foundational evaluator training paradigm; FARE extends this with larger data scale and iterative training" 444 }, 445 { 446 "title": "Direct Judgement Preference Optimization", 447 "relevance": "Related multi-task foundational evaluator using direct judgment data; key methodological comparison and baseline" 448 }, 449 { 450 "title": "Self-Taught Evaluators", 451 "relevance": "Related iterative SFT approach for training evaluators; contrasted with FARE in terms of data scale, task coverage, and training stability" 452 }, 453 { 454 "title": "J1: Incentivizing Thinking in LLM-as-a-Judge via Reinforcement Learning", 455 "relevance": "Key RL-trained evaluator baseline that FARE claims to match or outperform despite simpler training methodology" 456 }, 457 { 458 "title": "RM-R1: Reward Modeling as Reasoning", 459 "relevance": "RL-trained evaluator baseline; FARE-8B outperforms RM-R1-14B on most benchmarks, supporting the data-scaling argument" 460 }, 461 { 462 "title": "JudgeBench: A Benchmark for Evaluating LLM-based Judges", 463 "relevance": "Primary pairwise reasoning evaluation benchmark used throughout; introduces consistent accuracy metric adopted by this paper" 464 }, 465 { 466 "title": "ProcessBench: Identifying Process Errors in Mathematical Reasoning", 467 "relevance": "Step-level evaluation benchmark where FARE-20B achieves state-of-the-art performance, matching GPT-5" 468 }, 469 { 470 "title": "Evaluating Judges as Evaluators: The JETTS Benchmark of LLM-as-Judges as Test-Time Scaling Evaluators", 471 "relevance": "Framework for downstream inference-time scaling evaluation; used to assess FARE as a best-of-N reranker across multiple generators and tasks" 472 }, 473 { 474 "title": "General-Reasoner: Advancing LLM Reasoning Across All Domains", 475 "relevance": "Provides the WebInstruct-Verified training setup and General-Verifier baseline for GRPO training experiments; FARE-20B verifier is compared against their approach" 476 } 477 ], 478 "engagement_factors": { 479 "practical_relevance": { 480 "score": 3, 481 "justification": "FARE directly addresses high-demand infrastructure needs for scalable evaluators in RL training and inference-time scaling, with demonstrated practical gains in both settings using off-the-shelf training techniques." 482 }, 483 "surprise_contrarian": { 484 "score": 2, 485 "justification": "Challenges the dominant narrative that RL training is necessary for state-of-the-art evaluators, showing simple data scaling with RS-SFT matches or beats RL-trained models at far lower compute cost." 486 }, 487 "fear_safety": { 488 "score": 0, 489 "justification": "No AI safety concerns are raised; the paper is a systems/ML engineering contribution about training better automated evaluators." 490 }, 491 "drama_conflict": { 492 "score": 1, 493 "justification": "Implicitly critiques the recent trend toward RL-based evaluator training as unnecessary complexity, but frames this as a finding rather than a confrontational argument." 494 }, 495 "demo_ability": { 496 "score": 1, 497 "justification": "No model weights release or demo link is provided in the paper text; the models may be available but are not publicized in this preprint." 498 }, 499 "brand_recognition": { 500 "score": 2, 501 "justification": "Salesforce AI Research is a recognized industrial AI lab; the paper benchmarks against and claims to outperform OpenAI's GPT-5 on several evaluation tasks." 502 } 503 }, 504 "hn_data": { 505 "threads": [ 506 { 507 "hn_id": "45657595", 508 "title": "Binary Retrieval-Augmented Reward Mitigates Hallucinations", 509 "points": 44, 510 "comments": 3, 511 "url": "https://news.ycombinator.com/item?id=45657595", 512 "created_at": "2025-10-21T16:14:28Z" 513 }, 514 { 515 "hn_id": "42984225", 516 "title": "Leveraging Multimodal LLM for Inspirational User Interface Search", 517 "points": 2, 518 "comments": 0, 519 "url": "https://news.ycombinator.com/item?id=42984225", 520 "created_at": "2025-02-08T16:52:28Z" 521 }, 522 { 523 "hn_id": "45876369", 524 "title": "Diagnosing Representation Dynamics in NER Model Extension", 525 "points": 1, 526 "comments": 0, 527 "url": "https://news.ycombinator.com/item?id=45876369", 528 "created_at": "2025-11-10T14:30:09Z" 529 } 530 ], 531 "top_points": 44, 532 "total_points": 47, 533 "total_comments": 3 534 } 535 }