ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27009B)


      1 {
      2   "paper": {
      3     "title": "On Calibration of Large Language Models: From Response To Capability",
      4     "authors": ["Sin-Han Yang", "Cheng-Kuang Wu", "Chieh-Yen Lin", "Yun-Nung Chen", "Hung-yi Lee", "Shao-Hua Sun"],
      5     "year": 2026,
      6     "venue": "Preprint",
      7     "arxiv_id": "2602.13540"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The abstract states 'Source code: https://github.com/appier-research/llm-calibration' providing a GitHub URL for the code."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses publicly available datasets: TriviaQA, SimpleQA, GSM8K, MATH-500, AIME25, MMLU, and GPQA. All are standard public benchmarks."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper specifies model names and sampling hyperparameters (Appendix B.2) and probe training hyperparameters (Table 5), but does not provide a requirements.txt, Dockerfile, or detailed environment setup section listing library versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While the paper provides a GitHub link and detailed descriptions of methods and hyperparameters, there are no explicit step-by-step reproduction instructions (e.g., README with commands) within the paper itself. The code repository may contain them, but the paper does not."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Figure 6 shows 95% confidence intervals for the expected accuracy estimation under different keval values. The pass@k simulation in Figure 9 also shows confidence interval bands. However, the main results in Table 2 report only point estimates without uncertainty."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper makes comparative claims (e.g., 'probing consistently outperforms the random baseline', 'Probe-MATH outperforms Oracle-RC') but relies solely on comparing Brier score values without any statistical significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Results are reported as raw Brier scores and MSE values. No standardized effect sizes (Cohen's d, relative improvement with baseline context) are reported. The reader can compute relative differences from the tables, but the paper does not contextualize the magnitude of improvements."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Section 4.3.1 and Appendix B.1 provide a thorough justification for the choice of keval=100, including theoretical analysis of binomial variance, margin of error calculations, and empirical sensitivity analysis (Figure 6)."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Main results in Table 2 report single Brier score values without variance, standard deviation, or confidence intervals across runs. There is no mention of running experiments with multiple random seeds or reporting spread measures for the main calibration results."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper includes a uniform random baseline with analytical expected loss (Section 4.2), and compares against verbalized confidence, P(True), and response consistency methods (Table 2)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include recent methods: verbalized confidence (Tian et al., 2023), P(True) (Kadavath et al., 2022), response consistency (Wang et al., 2022), and probing (Li et al., 2021). The paper adapts these to the capability calibration setting and uses recent models (Olmo-3, Qwen3, gpt-oss-20b)."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper conducts multiple ablation-like studies: effect of keval (Appendix B.1), effect of kc for response consistency (Appendix C.3, Table 6), effect of training dataset domain on probe generalization (Table 2, Table 7), and mixed vs. single-dataset probes (Appendix C.4)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper uses Brier score as the primary calibration metric, but also evaluates downstream application performance via MSE for pass@k simulation (Table 3) and success rate for inference budget allocation (Figure 5). The paper explains why it chose Brier score over ECE (Section 4.1)."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is a calibration framework paper evaluating confidence estimation methods on standard benchmarks. Human evaluation of system outputs is not relevant to the claims about calibration quality."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "For probing experiments, the paper uses separate training, validation, and test splits. For MATH, 'we use MATH-500 as the test set, and the remaining 12,000 instances as the training and validation sets' (Appendix C.2). Out-of-distribution generalization is tested on entirely separate datasets."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 2 provides per-dataset breakdowns across 7 datasets for all methods and 3 models. Results are broken down by domain (factual knowledge, mathematical reasoning, general exams) and by in-domain vs. out-of-domain generalization."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses when probing fails to generalize to out-of-domain settings (Section 4.3.2), when verbalized confidence fails (worse than random baseline for some models, Table 2), and when Probe-MATH does not fit well for pass@k simulation (Figure 9, Appendix D.1.2)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that 'Olmo-3-7B-Instruct and Qwen3-8B do not even consistently outperform the random uniform baseline with verbalized confidence' (Section 4.3.2). Probing does not consistently generalize to out-of-domain settings. 2-layer MLP probes were tried but 'did not differ from linear probes' (Appendix C.2)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims that capability calibration differs from response calibration theoretically and empirically (supported by Theorems 1-2 and Figure 2), that capability-calibrated confidence improves pass@k prediction (supported by Table 3) and inference budget allocation (supported by Figure 5). All claims are supported by results in the paper."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims through ablation-style comparisons (e.g., changing the training dataset for probes, varying keval, varying kc). These are controlled single-variable manipulations. The theoretical claims (Theorems 1-2) are proved mathematically. The paper's main claims are about the distinction between definitions rather than causal improvement claims."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper explicitly tests and discusses generalization boundaries: in-domain in-distribution, in-domain out-of-distribution, and out-of-domain settings (Table 2, Section 4.3.2). It acknowledges that 'Developing generalizable methods for capability calibration remains an important direction for future work.' The title is appropriately scoped."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not substantively discuss alternative explanations for its results. For example, it does not consider whether the advantage of probing is due to memorizing dataset-specific patterns rather than capturing true capability, or whether the theoretical framework's assumptions (e.g., deterministic correctness function) limit its applicability. The Impact Statement section does not address these concerns."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper uses 'Olmo-3-7B-Instruct', 'Qwen3-8B', and 'gpt-oss-20b'. These are model family names with size designations but not specific version snapshots or API version identifiers. No snapshot dates or checkpoint hashes are provided."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full prompt text is provided for verbalized confidence (Figure 7) and P(True) (Figure 8) in the appendix. For probing, prompts are not needed as it operates on model activations. The prompts include exact wording with placeholders (e.g., '{question}') whose fill values are the benchmark questions."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Sampling hyperparameters are reported in Appendix B.2 (temperature, top-p for each model). Probe training hyperparameters are comprehensively reported in Table 5 (epochs, batch size, weight decay, learning rate, loss function, pooling method, feature standardization)."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use agentic scaffolding. It evaluates confidence estimation methods applied to LLM outputs, not agent-based systems."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The paper documents data splits (Appendix C.2): TriviaQA and GSM8K use their training sets; MATH uses MATH-500 as test and remaining 12,000 as training/validation. Dataset selection criteria are described in Section 4.3.1 with clear rationale for each domain and difficulty level."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "The paper does not have a dedicated limitations or threats-to-validity section. Future directions are mentioned briefly in Section 6 (Conclusion), but there is no substantive discussion of limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed. The paper acknowledges that probing does not generalize to out-of-domain settings (Section 4.3.2) and that Probe-MATH does not fit well for pass@k simulation (Appendix D.1.2), but these are presented as results rather than as threats to the validity of the overall framework."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. Section 5.3 mentions that 'a comprehensive empirical evaluation of all downstream tasks is beyond the scope of this work,' but this is the only scope boundary discussed. There is no discussion of which populations, task types, or model families the results may not generalize to."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "All datasets used are publicly available benchmarks (TriviaQA, SimpleQA, GSM8K, MATH-500, AIME25, MMLU, GPQA). Source code is released at the provided GitHub link, which could contain generated model outputs for verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4.3.1 describes the data collection procedure: datasets selected from three task domains (factual knowledge, mathematical reasoning, general exams) with explicit justification for each choice. For capability calibration targets, keval=100 samples are drawn per query with detailed justification (Appendix B.1)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved. Data sources are standard public benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline is documented: sampling keval=100 responses per query, computing correctness via deterministic evaluator C, estimating expected accuracy, training probes on hidden states, and evaluating with Brier score. Appendix B.1, C.2, and the main paper describe each step."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The Acknowledgements section discloses funding: 'This work was supported in part by the National Science and Technology Council, Taiwan, under the Grant 114-2628-E-002-021-, and the Taiwan Centers of Excellence. Shao-Hua Sun was supported by the Yushan Fellow Program of the Ministry of Education, Taiwan.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly stated: Appier AI Research and National Taiwan University. The affiliations are listed in the header of the paper."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The funders are the National Science and Technology Council of Taiwan and the Ministry of Education of Taiwan. These are government agencies with no direct financial interest in the outcome of calibration research. Authors from Appier could have a commercial interest, but the paper evaluates open models, not Appier products."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is included in the paper. Some authors are affiliated with Appier (a commercial AI company), but no declaration of competing interests is provided."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper evaluates LLMs on benchmarks (TriviaQA, GSM8K, MATH-500, MMLU, GPQA, etc.) but does not state the training data cutoff dates for any of the three models used (Olmo-3-7B-Instruct, Qwen3-8B, gpt-oss-20b)."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "The paper does not discuss whether the benchmark datasets (many of which are publicly available and widely used) may have been included in the training data of the evaluated models."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "Several benchmarks used (TriviaQA from 2017, GSM8K from 2021, MMLU from 2020, MATH from 2021) were published well before the evaluated models were trained, so they may appear in the models' training data. No contamination analysis is provided despite this risk."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Inference cost is a central dimension of the analysis. Figures 3 and 4 plot cost-performance tradeoffs. Table 2 lists relative cost per method. The paper discusses that probing costs 'less than decoding a single token' and response consistency 'costs more than decoding the response itself' (Sections 4.2, 4.3.2)."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "The total computational budget is not stated. The paper draws keval=100 samples per query across 7 datasets and 3 models, which represents substantial compute, but no total GPU hours, API costs, or hardware specifications are provided. Appendix B.2 mentions 'limited computational budget' for Qwen3-8B but does not quantify it."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Capability calibration is theoretically and empirically distinct from response calibration.",
    286       "evidence": "Theorem 1 proves distinct optimal confidence estimators. Theorem 2 shows the loss decomposition with output correctness variance term. Figure 2 and Appendix E empirically confirm that response-level correctness does not reflect expected accuracy across models and datasets.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Linear probing on LLM hidden states provides the best cost-performance tradeoff for capability calibration.",
    291       "evidence": "Table 2 and Figures 3-4 show that probing consistently outperforms the random baseline at less-than-one-token inference cost. Probing achieves in-domain in-distribution Brier scores competitive with or better than other methods across all 3 models and 7 datasets.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Capability-calibrated confidence outperforms response-calibrated confidence for pass@k simulation.",
    296       "evidence": "Table 3 shows that Oracle-CC achieves near-zero MSE for pass@k simulation, while Oracle-RC's MSE increases with k. Probe-MATH outperforms Oracle-RC across all models and k values on MATH-500.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Capability-calibrated confidence improves inference budget allocation over uniform allocation.",
    301       "evidence": "Figure 5 and Appendix D.2.2 (Figure 10) show that greedy allocation using capability-calibrated confidence (Oracle, Probe-MATH, Verbalized) outperforms uniform allocation across compute budgets on MATH-500 and AIME25.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Probing generalizes reasonably well under in-domain out-of-distribution settings but not consistently to out-of-domain settings.",
    306       "evidence": "Table 2 shows that probes trained on TriviaQA generalize well to SimpleQA (same factual knowledge domain) across models, but performance on out-of-domain tasks (e.g., math probes on factual knowledge) is inconsistent.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Verbalized confidence effectiveness is model-dependent.",
    311       "evidence": "Table 2 shows gpt-oss-20b achieves best or second-best performance with verbalized confidence, while Olmo-3-7B-Instruct and Qwen3-8B do not consistently outperform the random baseline with the same method.",
    312       "supported": "strong"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval", "theoretical"],
    316   "key_findings": "The paper introduces capability calibration, which targets a model's expected accuracy on a query (rather than single-response correctness), and proves it is theoretically distinct from response calibration via a loss decomposition involving output correctness variance. Experiments across 3 LLMs and 7 datasets show that linear probing on model hidden states offers the best cost-performance tradeoff for capability calibration, achieving meaningful calibration at less than single-token inference cost. Capability-calibrated confidence is shown to improve pass@k prediction (outperforming oracle response calibration) and inference budget allocation (outperforming uniform allocation).",
    317   "red_flags": [
    318     {
    319       "flag": "No contamination analysis",
    320       "detail": "The paper evaluates models on benchmarks like TriviaQA (2017), GSM8K (2021), MMLU (2020), and MATH (2021) without discussing whether these datasets may be in the models' training data. Contamination could affect the expected accuracy estimates and invalidate the calibration measurements."
    321     },
    322     {
    323       "flag": "No significance testing",
    324       "detail": "All comparative claims are based on comparing raw Brier scores without statistical tests. Given the stochastic nature of the evaluation (sampling keval=100 responses per query), the differences between methods could be within noise."
    325     },
    326     {
    327       "flag": "No limitations section",
    328       "detail": "The paper lacks a dedicated limitations or threats-to-validity section, which is unusual for a venue-targeted ML paper. Key limitations (e.g., deterministic correctness function assumption, limited model diversity with only 3 open-weight models, limited task coverage) are not discussed."
    329     },
    330     {
    331       "flag": "Model version ambiguity",
    332       "detail": "Model versions are specified only by name and size (e.g., 'Olmo-3-7B-Instruct', 'gpt-oss-20b') without specific checkpoint identifiers or snapshot dates, making exact reproduction difficult."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "A survey of confidence estimation and calibration in large language models",
    338       "authors": ["J. Geng", "F. Cai", "Y. Wang", "H. Koeppl", "P. Nakov", "I. Gurevych"],
    339       "year": 2024,
    340       "relevance": "Comprehensive survey of LLM confidence estimation and calibration methods, directly relevant to understanding the state of the art this paper builds upon."
    341     },
    342     {
    343       "title": "Can llms express their uncertainty? an empirical evaluation of confidence elicitation in llms",
    344       "authors": ["M. Xiong", "Z. Hu", "X. Lu", "Y. Li", "J. Fu", "J. He", "B. Hooi"],
    345       "year": 2023,
    346       "arxiv_id": "2306.13063",
    347       "relevance": "Empirical study of confidence elicitation methods in LLMs, a key baseline approach that this paper extends to capability-level calibration."
    348     },
    349     {
    350       "title": "Language models (mostly) know what they know",
    351       "authors": ["S. Kadavath", "T. Conerly", "A. Askell"],
    352       "year": 2022,
    353       "arxiv_id": "2207.05221",
    354       "relevance": "Introduces the P(True) confidence estimation method evaluated as a baseline in this paper."
    355     },
    356     {
    357       "title": "Self-consistency improves chain of thought reasoning in language models",
    358       "authors": ["X. Wang", "J. Wei", "D. Schuurmans", "Q. Le", "E. Chi", "S. Narang", "A. Chowdhery", "D. Zhou"],
    359       "year": 2022,
    360       "arxiv_id": "2203.11171",
    361       "relevance": "Introduces self-consistency/response consistency as a confidence estimation strategy, evaluated as a baseline method."
    362     },
    363     {
    364       "title": "Learning how hard to think: Input-adaptive allocation of lm computation",
    365       "authors": ["M. Damani", "I. Shenfeld", "A. Peng", "A. Bobu", "J. Andreas"],
    366       "year": 2024,
    367       "arxiv_id": "2410.04707",
    368       "relevance": "Proposes inference budget allocation framework that directly leverages expected accuracy, used as a downstream application for capability calibration."
    369     },
    370     {
    371       "title": "Scaling llm test-time compute optimally can be more effective than scaling model parameters",
    372       "authors": ["C. Snell", "J. Lee", "K. Xu", "A. Kumar"],
    373       "year": 2024,
    374       "arxiv_id": "2408.03314",
    375       "relevance": "Studies optimal test-time compute scaling for LLMs, directly relevant to the inference budget allocation application."
    376     },
    377     {
    378       "title": "How do large language monkeys get their power (laws)?",
    379       "authors": ["R. Schaeffer", "J. Kazdan", "J. Hughes"],
    380       "year": 2025,
    381       "arxiv_id": "2502.17578",
    382       "relevance": "Studies pass@k scaling in LLMs and AI safety implications, directly motivating the pass@k simulation application."
    383     },
    384     {
    385       "title": "Efficient prediction of pass@k scaling in large language models",
    386       "authors": ["J. Kazdan", "R. Schaeffer", "Y. Allouah"],
    387       "year": 2025,
    388       "arxiv_id": "2510.05197",
    389       "relevance": "Proposes methods for predicting pass@k performance, a direct comparison point for the pass@k simulation application."
    390     },
    391     {
    392       "title": "On calibration of modern neural networks",
    393       "authors": ["C. Guo", "G. Pleiss", "Y. Sun", "K. Q. Weinberger"],
    394       "year": 2017,
    395       "relevance": "Foundational work on calibration of neural networks that establishes the response calibration framework this paper extends."
    396     },
    397     {
    398       "title": "Beyond the singular: The essential role of multiple generations in effective benchmark evaluation and analysis",
    399       "authors": ["W. Zhang", "H. Cai", "W. Chen"],
    400       "year": 2025,
    401       "arxiv_id": "2502.08943",
    402       "relevance": "Argues for using multiple generations in LLM evaluation, directly supporting the motivation for capability calibration."
    403     },
    404     {
    405       "title": "Evaluating large language models trained on code",
    406       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    407       "year": 2021,
    408       "arxiv_id": "2107.03374",
    409       "relevance": "Introduces pass@k evaluation methodology and unbiased estimator used in this paper's pass@k simulation experiments."
    410     },
    411     {
    412       "title": "RouteLLM: Learning to route llms with preference data",
    413       "authors": ["I. Ong", "A. Almahairi", "V. Wu"],
    414       "year": 2024,
    415       "arxiv_id": "2406.18665",
    416       "relevance": "Proposes LLM routing methods that could benefit from capability calibration as discussed in the paper's applications section."
    417     }
    418   ]
    419 }

Impressum · Datenschutz