ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (27556B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Foundational Automatic Evaluators: Scaling Multi-Task Generative Evaluator Training for Reasoning-Centric Domains",
      6     "authors": [
      7       "Austin Xu",
      8       "Xuan-Phi Nguyen",
      9       "Yilun Zhou",
     10       "Chien-Sheng Wu",
     11       "Caiming Xiong",
     12       "Shafiq Joty"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2510.17793",
     17     "doi": "10.48550/arXiv.2510.17793"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Abstract claims (FARE-8B challenging larger evaluators, FARE-20B surpassing 70B+ models, near-oracle MATH reranking, 14.1% RL training gain, 65% code evaluation improvement) are all backed by Tables 1-3 and Figures 3-5.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Causal claims about training components are supported by ablation studies in Table 6, which systematically vary direct judgment data proportion, curriculum learning, and CoT retention strategy.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The paper explicitly scopes claims to reasoning-centric domains in the title and throughout, and reports per-benchmark performance rather than sweeping generalizations.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper does not discuss alternative explanations for FARE's strong performance — whether results stem primarily from data scale, base model quality, training method, or domain coverage is not systematically disentangled.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper clearly distinguishes benchmark evaluation (static benchmarks for evaluator quality) from downstream real-world performance (RL training, inference-time reranking), with appropriate metrics for each.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "There is no dedicated limitations section. Brief mentions of future work appear in Appendix B.2, but no limitations or threats-to-validity section exists.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No specific threats to validity are discussed, such as benchmark saturation, base model contamination, or limited evaluator generalization outside tested reasoning domains.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper focuses on reasoning-centric domains but does not explicitly state what its results do NOT show (e.g., there is no statement that the results do not extend to non-English/multilingual or long-form creative evaluation settings).",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding disclosure is present. All authors are Salesforce AI Research employees but no external funding or grant acknowledgment appears in the paper.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All authors are clearly identified as Salesforce AI Research affiliates on the title page with contact emails.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Salesforce employees train and evaluate their own FARE models; there is no independent evaluation by parties without a stake in the outcome.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement, patent disclosures, or financial interest declarations appear anywhere in the paper.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper formally defines automatic evaluator (AE), input/output spaces, and all five evaluation tasks (pairwise, step-level, reference-based verification, reference-free verification, single rating) with mathematical notation in Section 2.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Three explicit contributions are enumerated in the introduction: multi-task dataset curation, scalable RS-SFT training recipe, and the FARE family of evaluators with rigorous evaluation.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2 and Appendix A thoroughly situate FARE relative to prompted evaluators, SFT/DPO-trained evaluators, RL-trained evaluators, and earlier foundational evaluators, explaining key differences from STE, EvalPlanner, CompassJudger, and J1.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "No code release is mentioned anywhere in the paper. The training framework (OpenRLHF, verl) is referenced but no repository link for the FARE training pipeline is provided.",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": false,
    131           "justification": "The 2.5M curated training samples are not released. Evaluation uses public benchmarks, but the novel training dataset (including synthetic data and rubrics) is proprietary.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "While OpenRLHF and verl frameworks are named and hyperparameters listed, no requirements file, Dockerfile, or full dependency specification is provided.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "Appendix B.2 provides training hyperparameters but without code, training data, or step-by-step instructions, the work cannot be reproduced.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "All results in Tables 1-4 and Figures 3-5 are single point estimates with no confidence intervals or error bars reported.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "No statistical significance tests are applied to any comparative claims; performance differences are stated as absolute point improvements without any testing of whether they exceed chance.",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Absolute point differences and relative gains are consistently reported in context (e.g., FARE-8B beats J1-8B by 13.71 points on JudgeBench, 14.1% relative gain over string-matching verifiers), providing effect size context.",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "The 2.5M training sample size is motivated by the scaling hypothesis from prior work but no formal sample size justification or power analysis is provided.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "All benchmark evaluations are single runs with no variance, standard deviation, or inter-run variability reported across any experiment.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Extensive baselines are included: RISE-Judge, EvalPlanner, J1, RM-R1, CompassJudger, Atla Selene, SFR-Judge, Skywork-Critic, StepWiser, and frontier models like GPT-4o, GPT-5, and gpt-oss-120B.",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Baselines include very recent 2025 RL-trained models (J1, RM-R1, StepWiser) and frontier models (GPT-5, gpt-oss-120B), all contemporary at time of publication.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Table 6 ablates proportion of direct judgment data (30-70%), continuous curriculum vs. random shuffling, and CoT retention strategy for the 20B model, quantifying each component's impact on pairwise and step-level benchmarks.",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "The paper uses consistent accuracy for pairwise benchmarks, F1 for ProcessBench, Pearson correlation for single-rating tasks, and accuracy for VerifyBench, across 7 core benchmarks and 3 downstream settings.",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": false,
    206           "answer": false,
    207           "justification": "The paper trains and evaluates automated evaluators on automated benchmarks; human evaluation of FARE outputs is not conducted and is clearly not relevant to this benchmarking paradigm.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "All evaluation is on held-out test benchmarks (JudgeBench, ProcessBench, VerifyBench, etc.) separate from training data, with explicit N-gram decontamination applied.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "ProcessBench results are broken down by subset of increasing difficulty (GSM8K, MATH, OlympiadBench, OmniMATH); CodingJudgeBench by task type; JETTS provides per-generator and per-benchmark breakdowns in Table 10.",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Section D.6 explicitly notes FARE-8B fails to improve larger generators on harder benchmarks in JETTS; D.2 shows removing CoT from FARE-20B degrades most benchmark scores; Table 4 shows SC hurts MBPP+.",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Negative results include: self-consistency degrades FARE performance on MBPP+, removing CoT from FARE-20B reduces most benchmark scores, and FARE-8B cannot universally improve generator performance in reranking.",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "Base models are specifically identified as Qwen3-8B-Base and gpt-oss-20B with arXiv citations; all 12 generator models for synthetic data are enumerated by name and model family in Appendix B.1.",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Appendix E.1 provides full verbatim prompts for pairwise evaluation, direct judgment pairwise, step-level evaluation, and reference-based verification, with all placeholder variables identified.",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Training hyperparameters are reported throughout: batch size 128, learning rate 1e-6, rollout batch sizes 50K/250K, K=4 rollout samples at temperature 0.9, and KL coefficient 0.001 for GRPO experiments.",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "This paper trains evaluator models without agentic scaffolding; no agentic framework is used in the experimental setup.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Appendix B.1 describes N-gram decontamination, hand-crafted rubric creation per dataset, programmatic error injection details, and the generate-then-grade procedure with temperature sampling specifics.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "The 2.5M training samples are not released; only Table 5 listing source datasets is provided, making independent verification of the curated dataset impossible.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Section 3.1 and Appendix B.1 describe both existing data collection (sources, rubric creation) and synthetic data generation (programmatic error injection and generate-then-grade) in substantial detail with Table 5 enumerating all 24 source datasets.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "No human participants; all data comes from existing public datasets and automated synthesis pipelines.",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The full pipeline from seed datasets through rubric creation, response generation (12 generators), correctness grading, N-gram decontamination, and final dataset composition is documented across Section 3.1 and Appendix B.1.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "Training cutoffs for the base models (Qwen3-8B-Base, gpt-oss-20B) are not stated, making it unclear whether benchmark examples were available during base model pre-training.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": true,
    303           "justification": "Appendix B.1 explicitly states they applied N-gram matching decontamination following Guha et al. (2025) to remove fine-tuning training samples overlapping with evaluation benchmarks.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": true,
    309           "justification": "The paper explicitly addresses potential benchmark contamination through N-gram matching decontamination of training sets and focuses on modern (2024+) datasets to reduce temporal overlap.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No human participants; pre-registration is not applicable.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants; IRB/ethics approval is not applicable.",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "The paper discusses efficiency as a design goal and compares model sizes/active parameters, but reports no specific inference latency, throughput, or cost numbers.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "Training details (batch size, rollout batch size, steps) are provided but total GPU-hours or compute budget for training FARE-8B or FARE-20B is not disclosed.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "FARE-8B outperforms RL-trained evaluators of comparable or larger size on JudgeBench",
    376       "evidence": "Table 1 shows FARE-8B scores 55.71 on JudgeBench vs J1-8B (42.00) and RM-R1-14B (46.86), a 13.71 and 8.85 point margin respectively",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "FARE-20B sets a new standard for open-source evaluators, surpassing specialized 70B+ models",
    381       "evidence": "Table 1 shows FARE-20B (64.29 JudgeBench, 74.4 PPE) outperforming EvalPlanner-70B (56.60, 70.2) and J1-70B (60.00, 72.8) despite 3.5x fewer total and ~20x fewer active parameters",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "FARE-20B achieves near-oracle inference-time reranking performance on MATH",
    386       "evidence": "Figure 3 shows FARE-20B approaching the oracle green line on MATH across multiple generators, outperforming SFR-Judge-70B by 14 points and Skywork-Critic-70B by 21 points on Llama-3.1-8B generator",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Using FARE-20B as verifiers in GRPO training improves downstream model performance by 14.1% over string-matching verifiers",
    391       "evidence": "Figure 4 shows Qwen2.5-7B-Base trained with the FARE-20B verifier reaches 45.2 vs 39.6 (string matching); the 14.1% figure is a relative improvement (5.6 points absolute) from a single run with no variance reported",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Continual finetuning of FARE-20B for code with only 15K samples (FARE-20B-Code) outperforms gpt-oss-120B on average",
    396       "evidence": "Figure 5 shows FARE-20B-Code average consistent accuracy exceeds gpt-oss-120B across three CodingJudgeBench tasks, with 10.48 point gain on test-case quality over FARE-20B",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "Large-scale RS-SFT without RL is competitive with RL-trained specialized evaluators",
    401       "evidence": "FARE-8B and FARE-20B trained with rejection sampling SFT outperform RL-trained models (J1, RM-R1, StepWiser) on most benchmarks in Tables 1-3",
    402       "supported": "strong"
    403     },
    404     {
    405       "claim": "Positional robustness in pairwise evaluation emerges as a function of training data scale",
    406       "evidence": "Figure 6 shows pairwise consistency increasing monotonically from ~65% to ~80% as training samples increase from 0 to 2.5M for both Qwen3 and Qwen2.5 initializations",
    407       "supported": "moderate"
    408     }
    409   ],
    410   "methodology_tags": [
    411     "benchmark-eval"
    412   ],
    413   "key_findings": "FARE demonstrates that scaling training data to 2.5M multi-task, multi-domain samples with iterative rejection sampling SFT achieves state-of-the-art performance for generative evaluators without computationally expensive RL training. FARE-8B at 8B parameters matches or exceeds specialized RL-trained evaluators at 14B+ parameters on reasoning benchmarks, while FARE-20B with 3.6B active parameters outperforms dense 70B+ specialized judges across 7 benchmarks. In downstream applications, FARE-20B achieves near-oracle best-of-10 reranking on MATH and yields 14.1% relative improvement over string-matching verifiers in GRPO RL training. An additional finding is that positional robustness emerges naturally with data scale, suggesting data-driven training can mitigate common evaluator biases without targeted interventions.",
    414   "red_flags": [
    415     {
    416       "flag": "No statistical testing",
    417       "detail": "All comparative claims are made on single-run point estimates without confidence intervals, error bars, or significance tests, making it impossible to determine if performance differences are reliable or within noise."
    418     },
    419     {
    420       "flag": "No code or training data release",
    421       "detail": "Neither the training pipeline code nor the 2.5M curated training samples are released, making reproduction effectively impossible despite the hyperparameter details provided."
    422     },
    423     {
    424       "flag": "Self-evaluation only",
    425       "detail": "All evaluations are conducted by the Salesforce team that developed FARE with no independent evaluation by external parties."
    426     },
    427     {
    428       "flag": "No compute budget disclosed",
    429       "detail": "Total GPU-hours or compute cost for training FARE-8B and FARE-20B is not reported, preventing assessment of practical reproducibility or cost-effectiveness."
    430     },
    431     {
    432       "flag": "Base model contamination unaddressed",
    433       "detail": "Training cutoffs for base models (Qwen3-8B-Base, gpt-oss-20B) are not stated; these models' pretraining data may overlap with evaluation benchmarks in ways the fine-tuning-level N-gram decontamination cannot address."
    434     },
    435     {
    436       "flag": "No limitations section",
    437       "detail": "The paper has no dedicated limitations section; scope boundaries regarding language, domain coverage, model scale, and benchmark generalization are not explicitly stated."
    438     }
    439   ],
    440   "cited_papers": [
    441     {
    442       "title": "Foundational Autoraters: Taming Large Language Models for Better Automatic Evaluation",
    443       "relevance": "Direct precursor introducing the foundational evaluator training paradigm; FARE extends this with larger data scale and iterative training"
    444     },
    445     {
    446       "title": "Direct Judgement Preference Optimization",
    447       "relevance": "Related multi-task foundational evaluator using direct judgment data; key methodological comparison and baseline"
    448     },
    449     {
    450       "title": "Self-Taught Evaluators",
    451       "relevance": "Related iterative SFT approach for training evaluators; contrasted with FARE in terms of data scale, task coverage, and training stability"
    452     },
    453     {
    454       "title": "J1: Incentivizing Thinking in LLM-as-a-Judge via Reinforcement Learning",
    455       "relevance": "Key RL-trained evaluator baseline that FARE claims to match or outperform despite simpler training methodology"
    456     },
    457     {
    458       "title": "RM-R1: Reward Modeling as Reasoning",
    459       "relevance": "RL-trained evaluator baseline; FARE-8B outperforms RM-R1-14B on most benchmarks, supporting the data-scaling argument"
    460     },
    461     {
    462       "title": "JudgeBench: A Benchmark for Evaluating LLM-based Judges",
    463       "relevance": "Primary pairwise reasoning evaluation benchmark used throughout; introduces consistent accuracy metric adopted by this paper"
    464     },
    465     {
    466       "title": "ProcessBench: Identifying Process Errors in Mathematical Reasoning",
    467       "relevance": "Step-level evaluation benchmark where FARE-20B achieves state-of-the-art performance, matching GPT-5"
    468     },
    469     {
    470       "title": "Evaluating Judges as Evaluators: The JETTS Benchmark of LLM-as-Judges as Test-Time Scaling Evaluators",
    471       "relevance": "Framework for downstream inference-time scaling evaluation; used to assess FARE as a best-of-N reranker across multiple generators and tasks"
    472     },
    473     {
    474       "title": "General-Reasoner: Advancing LLM Reasoning Across All Domains",
    475       "relevance": "Provides the WebInstruct-Verified training setup and General-Verifier baseline for GRPO training experiments; FARE-20B verifier is compared against their approach"
    476     }
    477   ],
    478   "engagement_factors": {
    479     "practical_relevance": {
    480       "score": 3,
    481       "justification": "FARE directly addresses high-demand infrastructure needs for scalable evaluators in RL training and inference-time scaling, with demonstrated practical gains in both settings using off-the-shelf training techniques."
    482     },
    483     "surprise_contrarian": {
    484       "score": 2,
    485       "justification": "Challenges the dominant narrative that RL training is necessary for state-of-the-art evaluators, showing simple data scaling with RS-SFT matches or beats RL-trained models at far lower compute cost."
    486     },
    487     "fear_safety": {
    488       "score": 0,
    489       "justification": "No AI safety concerns are raised; the paper is a systems/ML engineering contribution about training better automated evaluators."
    490     },
    491     "drama_conflict": {
    492       "score": 1,
    493       "justification": "Implicitly critiques the recent trend toward RL-based evaluator training as unnecessary complexity, but frames this as a finding rather than a confrontational argument."
    494     },
    495     "demo_ability": {
    496       "score": 1,
    497       "justification": "No model weights release or demo link is provided in the paper text; the models may be available but are not publicized in this preprint."
    498     },
    499     "brand_recognition": {
    500       "score": 2,
    501       "justification": "Salesforce AI Research is a recognized industrial AI lab; the paper benchmarks against and claims to outperform OpenAI's GPT-5 on several evaluation tasks."
    502     }
    503   },
    504   "hn_data": {
    505     "threads": [
    506       {
    507         "hn_id": "45657595",
    508         "title": "Binary Retrieval-Augmented Reward Mitigates Hallucinations",
    509         "points": 44,
    510         "comments": 3,
    511         "url": "https://news.ycombinator.com/item?id=45657595",
    512         "created_at": "2025-10-21T16:14:28Z"
    513       },
    514       {
    515         "hn_id": "42984225",
    516         "title": "Leveraging Multimodal LLM for Inspirational User Interface Search",
    517         "points": 2,
    518         "comments": 0,
    519         "url": "https://news.ycombinator.com/item?id=42984225",
    520         "created_at": "2025-02-08T16:52:28Z"
    521       },
    522       {
    523         "hn_id": "45876369",
    524         "title": "Diagnosing Representation Dynamics in NER Model Extension",
    525         "points": 1,
    526         "comments": 0,
    527         "url": "https://news.ycombinator.com/item?id=45876369",
    528         "created_at": "2025-11-10T14:30:09Z"
    529       }
    530     ],
    531     "top_points": 44,
    532     "total_points": 47,
    533     "total_comments": 3
    534   }
    535 }

Impressum · Datenschutz