scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30121B)
      1 {
      2   "paper": {
      3     "title": "Reliable LLM-Based Edge-Cloud-Expert Cascades for Telecom Knowledge Systems",
      4     "authors": [
      5       "Qiushuo Hou",
      6       "Sangwoo Park",
      7       "Matteo Zecchin",
      8       "Yunlong Cai",
      9       "Guanding Yu",
     10       "Osvaldo Simeone",
     11       "Tommaso Melodia"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv (eess.SP)",
     15     "arxiv_id": "2512.20012",
     16     "doi": "10.48550/arXiv.2512.20012"
     17   },
     18   "scan_version": 2,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval", "theoretical"],
     21   "key_findings": "The paper proposes MHT-ERM, a statistically principled threshold selection method for cascaded edge-cloud-expert LLM systems that provides finite-sample guarantees on misalignment risk via multiple hypothesis testing with Bonferroni correction. On the TeleQnA telecom benchmark, MHT-ERM achieves lower cost than human-only decisions while maintaining alignment constraints that conventional ERM violates, especially with small calibration sets. White-box Bayesian uncertainty estimation outperforms black-box prompt-based approaches for routing decisions.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Sec V states: 'The code to reproduce all the results in the following is available at https://github.com/qiushuo0913/reliable_LLM.' A working URL is provided."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The TeleQnA dataset is publicly available and open-sourced at https://github.com/netop-team/TeleQnA, as stated in Sec V-A footnote."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions 'a single A100 GPU' (Sec V-B) but does not provide library versions, requirements.txt, Dockerfile, or detailed environment specifications."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions are provided in the paper. The code repository is linked but the paper itself contains no README-style instructions or 'Reproducing Results' section."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Standard deviations are reported across 200 independent experiments (Figs 4-7 show error bars). Fig 3 shows box plots with interquartile ranges. The 95th percentile (1-δ quantile) is marked as a horizontal line."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No statistical significance tests (p-values, t-tests, etc.) are used to compare methods. Claims that MHT-ERM outperforms baselines are based on visual comparison of means and box plots, not formal statistical tests between methods."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Effect sizes are provided with baseline context. For example, cloud model accuracy 'increases from 70.4% to 71.6%' (Sec V-E), misalignment values like '0.4 > α = 0.3' for C-ERM (Sec V-D), and cost differences are shown with absolute values across conditions."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The calibration dataset size N=100 and test set size Ntest=1000 are stated but not justified. While the paper investigates the impact of calibration data size (Fig 4), no power analysis or justification for these specific choices is provided."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Results are averaged over 200 independent experiments with standard deviations reported (for Figs 4-7, the paper states that the 'shaded bar on plots shows one standard deviation on both sides'). Box plots in Fig 3 show full distributional information."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Five baselines are included: edge-only, cloud-only, human-only, C-ERM (conventional ERM with grid search), and MHT-ERM-B (Bonferroni correction without exploiting monotonicity). Described in Sec V-C."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "C-ERM represents the standard empirical risk minimization approach for threshold optimization, and MHT-ERM-B is an ablated variant of the proposed method. The single-tier baselines (edge-only, cloud-only, human-only) provide appropriate reference points. These are appropriate for the contribution type."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The paper systematically varies: (1) scoring method (white-box Bayesian vs black-box prompt-based, Fig 3), (2) calibration data size (Fig 4), (3) misalignment target α (Fig 5), (4) grid size (Fig 6), and (5) reasoning budget (Fig 7). MHT-ERM vs MHT-ERM-B directly ablates the monotonicity-exploiting testing strategy."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Two primary metrics are used: misalignment rate (disagreement with expert judgments) and system cost (computational/human resource cost). Both are reported for all experiments."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "All evaluation is automated using the TeleQnA benchmark with known correct answers. No human evaluation of system outputs is performed. For a system designed to support human decision-making, human evaluation of output quality would be relevant."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Explicit three-way separation: validation set Dval (100 examples for Bayesian posterior), calibration set D (N=100 for threshold optimization), and test set Dtest (Ntest=1000 for evaluation). Sec V-B states these are i.i.d. and disjoint."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "The TeleQnA dataset has 5 categories (research publications 45%, standards specifications 20%, research overview 20%, standards overview 10%, lexicon 5%) described in Sec V-A, but no per-category results are reported. All results are aggregate metrics."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "The paper shows where C-ERM fails the alignment constraint but does not discuss failure cases of MHT-ERM itself — e.g., what types of queries it misroutes, or when the cost overhead is excessive."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "Every experiment shows MHT-ERM meeting the alignment constraint and achieving lower cost than conservative alternatives. No experiments where MHT-ERM underperforms or fails are shown. Negative results shown are only for baselines, not for the proposed method."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The abstract claims 'superior cost-efficiency compared to conventional cascaded baselines, while ensuring reliability at prescribed confidence levels.' Figs 3-7 confirm MHT-ERM achieves lower cost than human-only while meeting the α constraint, and outperforms C-ERM in reliability."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The paper's main claims are theoretical (Proposition 1 with proof) rather than purely empirical. Comparative claims are based on controlled experiments varying one factor at a time (calibration size, α, grid size, scoring method). The ablation MHT-ERM vs MHT-ERM-B isolates the effect of the sequential testing strategy."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper consistently frames results as applying to 'telecommunications knowledge systems' and the TeleQnA benchmark specifically. The title, abstract, and all claims are bounded to the telecom domain. Sec II-A notes 'While the applicability of the methodology is broader, we adopt as a running example an automatic expert system for telecommunications networks.'"
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "No alternative explanations for the observed performance differences are considered. For example, the paper does not discuss whether the cost model (linear in parameters) might favor certain configurations, or whether the performance advantages are specific to TeleQnA's structure."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper explicitly defines misalignment as disagreement with expert answers on TeleQnA (Eq. 7) and measures exactly that. Claims match measurement granularity — 'alignment of automated answers with expert judgments' — without overclaiming broader reliability in deployed systems."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Exact model versions are specified: Qwen2-1.5B-instruct, Qwen2-7B-instruct, and Qwen3-4B. HuggingFace URLs are provided in references [34] and [35] (e.g., https://huggingface.co/Qwen/Qwen2-1.5B-Instruct)."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Full prompt templates are provided in Appendix A for both Qwen2-1.5B (two-stage: answer generation + confidence evaluation) and Qwen3-4B (with test-time scaling). The actual prompt text used for both stages is shown verbatim."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Comprehensive hyperparameters reported in Sec V-B: N=100, M=5, Q=100, α=0.3, δ=0.05, Ntest=1000, K=10 prompt permutations, Dval=100, Ledge=1.5, Lcloud=7 (Qwen2-7B) or 4 (Qwen3-4B), Lhuman=10."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. The system is a cascading/routing mechanism for query processing, not an agent with tools, memory, or feedback loops."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Sec V-A describes TeleQnA structure (10,000 multiple-choice questions, 5 categories with distribution percentages). Sec V-B documents the data partitioning into calibration (N=100), validation (100), and test (1000) sets, with i.i.d. sampling from Pxy. The 200-experiment averaging procedure is stated."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No dedicated limitations section exists. Sec VI (Conclusions) mentions future research directions (adaptive thresholds, multi-objective optimization, limited calibration datasets) in a single paragraph, but this is future work, not a substantive discussion of current limitations."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No threats to validity are discussed. The paper does not address potential issues such as the synthetic cost model, the reliance on a single benchmark, or the assumption that TeleQnA is representative of real-world telecom queries."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show — e.g., that results are limited to multiple-choice QA, that the cost model is relative rather than empirical, or that real-world deployment may differ from the i.i.d. assumption."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "The TeleQnA dataset is publicly available at https://github.com/netop-team/TeleQnA and the experimental code at https://github.com/qiushuo0913/reliable_LLM. Raw data can be independently verified."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Sec V-A describes TeleQnA: 10,000 multiple-choice questions across 5 categories with specified proportions, sourced from 3GPP, IEEE, ITU standards, research publications, and telecom lexicon materials. The benchmark source [27] is referenced."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. The data source is the standard TeleQnA benchmark dataset."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The pipeline is documented: TeleQnA questions → partition into calibration/validation/test → compute uncertainty and confidence scores → threshold optimization via MHT-ERM → evaluation on test set. The 200-experiment averaging with independent data splits is described in Sec V-B."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Funding is disclosed in the author information section: EPSRC Open Fellowship (EP/W024101/1), EPSRC project (EP/X011852/1) for Zecchin and Simeone, and NSF award CNS-2112471 for Melodia."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "All author affiliations are clearly listed: Zhejiang University, King's College London (KCLIP lab, CIIPS), and Northeastern University (INSI). No evaluated product is affiliated with any author's institution."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Funders are EPSRC (UK Research and Innovation) and NSF (US National Science Foundation), which are independent government funding agencies with no financial stake in the experimental outcomes."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is included in the paper. Absence of a disclosure statement does not confirm absence of conflicts."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "The paper does not state training data cutoff dates for Qwen2-1.5B-instruct, Qwen2-7B-instruct, or Qwen3-4B. These models are used to evaluate on TeleQnA but temporal overlap is not addressed."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No discussion of whether TeleQnA questions or answers appeared in the Qwen models' training data. The benchmark draws from publicly available standards documents and research publications that could plausibly be in training corpora."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "TeleQnA was published in 2025 (IEEE Network, early access). Qwen2 models were released in 2024. No analysis of whether benchmark content was available online before model training cutoffs."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": true,
    292         "justification": "Inference costs are central to the paper. Relative costs are defined: Ledge=1.5, Lcloud=7 (Qwen2-7B) or 4 (Qwen3-4B), Lhuman=10 (Sec V-B). System cost is a primary evaluation metric reported in all experiments. Hardware: single A100 GPU."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "Hardware is stated (single A100 GPU) but total GPU hours, wall-clock time, or actual dollar cost are not reported. Running 200 independent experiments across multiple LLMs represents non-trivial compute that is not quantified."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": true,
    304         "justification": "Results are averaged over 200 independent experiments (Sec V-B) with standard deviations reported. The randomness comes from calibration dataset sampling, which is the primary source of variance in the method. Figs 3-7 show distributional information."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "Sec V-B explicitly states: 'all the results in this section are reported after averaging over 200 independent experiments.'"
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The grid search (M=5, Q=100) is part of the method, but the paper does not report how meta-parameters like α=0.3, δ=0.05, cost ratios, or grid sizes were selected. No search budget for these choices is stated."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "Configuration selection is the method itself: MHT-ERM selects the optimal threshold pair by minimizing empirical cost over the statistically reliable subset Φ* (Eq. 25). The selection criterion is mathematically justified and proven (Proposition 1, Corollary 1)."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": true,
    323         "answer": true,
    324         "justification": "Multiple comparison correction is the core contribution. The paper uses Bonferroni correction across M parallel testing sequences (Eq. 22), with the sequential testing strategy exploiting monotonicity to reduce conservatism compared to global Bonferroni (MHT-ERM-B)."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors implement and evaluate their own method (MHT-ERM) and all baselines (C-ERM, MHT-ERM-B) without acknowledging the bias of evaluating their own system. No independent evaluation or acknowledgment of this bias."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The paper does not report performance as a function of actual compute budget. While cost is a metric, it uses synthetic relative units (1.5, 4, 7, 10) rather than actual compute. The compute difference between MHT-ERM and C-ERM (both run grid search) is not discussed."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The paper uses TeleQnA without questioning whether multiple-choice QA accuracy is a valid measure of telecom knowledge system reliability. No discussion of construct validity or comparison with alternative evaluation approaches."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No scaffolding is involved. The system is a routing/cascading mechanism, not an agent scaffold."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of temporal leakage. TeleQnA draws from publicly available telecom standards and research publications. Whether these sources were in Qwen training data is not addressed."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether the evaluation setup leaks information. The multiple-choice format provides answer options as input, which could affect model confidence scores differently than real-world open-ended queries."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "The paper assumes i.i.d. sampling for calibration and test sets but does not verify whether questions within TeleQnA are truly independent (e.g., multiple questions from the same standard document may share knowledge patterns)."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No concrete leakage detection or prevention method is used. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "MHT-ERM satisfies the alignment constraint (misalignment rate ≤ α) with probability at least 1-δ, providing finite-sample guarantees.",
    373       "evidence": "Proposition 1 and Corollary 1 (Sec IV-C) with formal proof in Appendix B. Experimentally verified across all settings in Figs 3-7 where the 95th percentile of misalignment stays below α.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "MHT-ERM achieves superior cost-efficiency compared to conventional cascaded baselines while maintaining reliability.",
    378       "evidence": "Figs 3-7 show MHT-ERM achieves lower cost than human-only and MHT-ERM-B while satisfying the alignment constraint. C-ERM achieves similar cost but violates the alignment constraint, especially with small calibration sets (Fig 4) and practical α values (Fig 5).",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "White-box Bayesian uncertainty estimation achieves lower misalignment rates than black-box prompt-based inference.",
    383       "evidence": "Fig 3(a) vs 3(b) comparison shows white-box methods achieve tighter misalignment distributions. Attributed to 'direct access to model logits' enabling 'more precise uncertainty estimation' (Sec V-D).",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "C-ERM violates the alignment constraint when calibration data is limited (N < 200) or when the misalignment target is strict (α < 0.5).",
    388       "evidence": "Fig 4 shows C-ERM exceeds α=0.3 when N=10 (misalignment ~0.4). Fig 5 shows C-ERM violates the constraint for practical α values. However, the comparison lacks statistical tests.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Reasoning-enhanced cloud deployment (Qwen3-4B with thinking tokens) improves alignment with marginal cost increase.",
    393       "evidence": "Fig 7 shows cloud model accuracy increases from 70.4% to 71.6% with increased thinking budget, and MHT-ERM maintains the alignment constraint throughout. However, the accuracy improvement is small (1.2 percentage points).",
    394       "supported": "weak"
    395     }
    396   ],
    397   "red_flags": [
    398     {
    399       "flag": "No contamination analysis",
    400       "detail": "The paper evaluates Qwen2 and Qwen3 models on TeleQnA without any analysis of whether benchmark content (sourced from publicly available telecom standards) was in the models' training data. Training cutoff dates are not stated."
    401     },
    402     {
    403       "flag": "No per-category results despite categorical data",
    404       "detail": "TeleQnA has 5 distinct categories with very different difficulty levels (lexicon vs. standards specifications), but all results are reported as aggregates. The routing behavior across categories would be highly informative but is not analyzed."
    405     },
    406     {
    407       "flag": "Synthetic cost model",
    408       "detail": "The cost model uses relative units proportional to parameter count (Ledge=1.5, Lcloud=7, Lhuman=10) rather than measured wall-clock time, dollar cost, or latency. The chosen cost ratios directly affect which method appears optimal but are not validated against real deployment costs."
    409     },
    410     {
    411       "flag": "No limitations discussion",
    412       "detail": "The paper has no dedicated limitations section. Key unaddressed limitations include: single benchmark evaluation, i.i.d. assumption in non-stationary telecom environments, and the gap between multiple-choice QA and real-world telecom decision support."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    418       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    419       "year": 2023,
    420       "arxiv_id": "2305.05176",
    421       "relevance": "Foundational work on LLM routing and cost-reduction through dynamic query routing between smaller and larger models."
    422     },
    423     {
    424       "title": "SpecInfer: Accelerating large language model serving with tree-based speculative inference and verification",
    425       "authors": ["Xupeng Miao", "Gabriele Oliaro"],
    426       "year": 2024,
    427       "relevance": "Speculative execution approach for LLM serving using small draft models verified by larger models, relevant to cascaded LLM architectures."
    428     },
    429     {
    430       "title": "Cascadia: A cascade serving system for large language models",
    431       "authors": ["Yuxuan Jiang", "Fangcheng Fu"],
    432       "year": 2025,
    433       "arxiv_id": "2506.04203",
    434       "relevance": "Confidence-based routing across multiple LLM APIs for cost-efficient serving, directly related to LLM cascade frameworks."
    435     },
    436     {
    437       "title": "Trust or escalate: LLM judges with provable guarantees for human agreement",
    438       "authors": ["Jaehun Jung", "Faeze Brahman", "Yejin Choi"],
    439       "year": 2025,
    440       "relevance": "LLM-based judging with provable statistical guarantees on human agreement, closely related to the learn-then-test reliability framework used in this paper."
    441     },
    442     {
    443       "title": "Learn then test: Calibrating predictive algorithms to achieve risk control",
    444       "authors": ["Anastasios N. Angelopoulos", "Stephen Bates", "Emmanuel J. Candès", "Michael I. Jordan", "Lihua Lei"],
    445       "year": 2025,
    446       "relevance": "The foundational LTT framework for finite-sample risk control via multiple hypothesis testing, which this paper extends to cascaded LLM systems."
    447     },
    448     {
    449       "title": "Towards a cascaded LLM framework for cost-effective human-AI decision-making",
    450       "authors": ["Claudio Fanconi", "Mihaela van der Schaar"],
    451       "year": 2025,
    452       "arxiv_id": "2506.11887",
    453       "relevance": "Cascaded LLM framework for human-AI collaboration with Bayesian uncertainty estimation, a primary baseline and inspiration for this work."
    454     },
    455     {
    456       "title": "Overconfidence in LLM-as-a-Judge: Diagnosis and confidence-driven solution",
    457       "authors": ["Zhe Tian", "Zihang Han"],
    458       "year": 2025,
    459       "arxiv_id": "2508.06225",
    460       "relevance": "Documents overconfidence in LLM judges and proposes confidence-driven solutions, relevant to LLM uncertainty estimation and reliability."
    461     },
    462     {
    463       "title": "To believe or not to believe your LLM: Iterative prompting for estimating epistemic uncertainty",
    464       "authors": ["Yasin Abbasi Yadkori", "Ilja Kuzborskij"],
    465       "year": 2024,
    466       "relevance": "Iterative prompting approach for LLM epistemic uncertainty estimation, relevant to the black-box uncertainty methods used in this paper."
    467     },
    468     {
    469       "title": "Can LLMs express their uncertainty? An empirical evaluation of confidence elicitation in LLMs",
    470       "authors": ["Miao Xiong", "Zhiyuan Hu"],
    471       "year": 2024,
    472       "relevance": "Empirical evaluation of LLM confidence elicitation methods, relevant to the self-confidence scoring approach used in this paper."
    473     },
    474     {
    475       "title": "Rational tuning of LLM cascades via probabilistic modeling",
    476       "authors": ["Michael J. Zellinger", "Matt Thomson"],
    477       "year": 2025,
    478       "arxiv_id": "2501.09345",
    479       "relevance": "Probabilistic approach to tuning LLM cascades, directly related to the cascade optimization problem addressed in this paper."
    480     },
    481     {
    482       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    483       "authors": ["Patrick Lewis", "Ethan Perez"],
    484       "year": 2020,
    485       "relevance": "Foundational RAG paper relevant to LLM-based knowledge systems and question-answering pipelines."
    486     },
    487     {
    488       "title": "Thought calibration: Efficient and confident test-time scaling",
    489       "authors": ["Mike Wu", "Carolyn Zhou", "Stephen Bates", "Tommi Jaakkola"],
    490       "year": 2025,
    491       "arxiv_id": "2505.18404",
    492       "relevance": "Test-time scaling with calibrated confidence, relevant to the reasoning-enhanced cloud deployment scenario in this paper."
    493     }
    494   ]
    495 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs