scan.json (28296B)
1 { 2 "paper": { 3 "title": "IMMACULATE: A Practical LLM Auditing Framework via Verifiable Computation", 4 "authors": [ 5 "Yanpei Guo", 6 "Wenjie Qu", 7 "Linyu Wu", 8 "Shengfang Zhai", 9 "Lionel Z. Wang", 10 "Ming Xu", 11 "Yue Liu", 12 "Binhang Yuan", 13 "Dawn Song", 14 "Jiaheng Zhang" 15 ], 16 "year": 2026, 17 "venue": "arXiv", 18 "arxiv_id": "2602.22700" 19 }, 20 "scan_version": 2, 21 "active_modules": ["experimental_rigor", "data_leakage"], 22 "methodology_tags": ["benchmark-eval", "theoretical"], 23 "key_findings": "IMMACULATE detects economically motivated deviations in black-box LLM API services (model substitution, quantization abuse, token overbilling) using a novel Logit Distance Distribution (LDD) metric combined with randomized auditing and verifiable computation. Per-request detection rates range from 1.3% (FP8 quantization on Qwen3) to 99% (model substitution on LLaMA3), with false positive rates below 10⁻⁵, while imposing under 1% throughput overhead on benign providers. Token overreporting is formally proven to reduce to model substitution under their hybrid computation model.", 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "The paper provides a GitHub URL in the abstract and Section 1: 'Our code is published at https://github.com/guo-yanpei/Immaculate.'" 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": true, 34 "justification": "Experiments use standard public benchmarks: GSM8K, TriviaQA, and WebQuestions (Section 6.1). These are publicly available datasets." 35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "Hardware is specified (NVIDIA RTX 6000 Pro, 96GB, tensor parallelism degree 2) and frameworks are named (vLLM, HuggingFace Transformers), but no requirements.txt, Dockerfile, library versions, or environment setup details are provided." 
40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "No step-by-step reproduction instructions, README description, or scripts for replicating the main experiments are described in the paper." 45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": false, 51 "justification": "All results in Tables 2–5 are reported as point estimates. No confidence intervals, error bars, or ± notation appears anywhere in the paper." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper claims BF16 and FP8/substitution LDDs are 'well-separated' and 'highly discriminative' based on visual inspection of distributions and raw proportions. No statistical significance tests (t-tests, KS tests, etc.) are applied to formally compare distributions." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Table 2 reports proportions for benign vs malicious under multiple thresholds with clear baselines (e.g., LLaMA3 BF16 0.017% vs FP8 1.7% vs Substitution 11% at >0.1 threshold). Table 5 reports absolute overhead values. The magnitude of effects is clear from the tabulated comparisons." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "The paper subsamples 500 prompts per dataset (1,500 total) without justifying this choice. No power analysis or rationale for sample size adequacy is provided." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": false, 71 "justification": "No standard deviation, variance across runs, or spread measures are reported. All results appear to be single-run point estimates." 
72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "Table 1 compares IMMACULATE against three paradigms: GPU TEE-based approaches, cryptographic approaches, and empirical methods across robustness, infrastructure-agnosticism, and efficiency. The evaluation also compares benign (BF16) vs malicious (FP8, substitution) deployments." 79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": true, 83 "justification": "Comparisons reference recent work: Cai et al. (2025) for GPU TEE auditing, Sun et al. (2024, 2025a) for cryptographic and token overreporting approaches, and Gao et al. (2024) for empirical methods. These are contemporary and relevant." 84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": false, 88 "justification": "The system combines randomized auditing, LDD computation, and verifiable computation, but no ablation study isolates the contribution of individual components. The hyperparameter study (Table 4) varies thresholds but does not ablate system components." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "The evaluation reports detection rate, false positive rate (Table 3), tail probability proportions across multiple thresholds (Table 2), and throughput overhead (Table 5)." 94 }, 95 "human_evaluation": { 96 "applies": false, 97 "answer": false, 98 "justification": "Human evaluation is irrelevant for a cryptographic auditing framework. The system's output is a statistical accept/reject decision, not content requiring human judgment." 
99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 6.1: 'We randomly sample 200 prompts from each dataset to form an independent setup set for calibrating auditing thresholds; all remaining prompts are held out and used exclusively for evaluation.'" 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "Results are broken down by model (LLaMA3-70B, Qwen3-32B, Qwen3-30B-A3B, DeepSeek-V2-Lite), by dataset (GSM8K, TriviaQA, WebQuestions in Table 3), and by attack type (BF16, FP8, substitution)." 109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": false, 113 "justification": "The paper does not discuss where IMMACULATE fails or breaks down. Low detection rates for certain configurations (e.g., Qwen3 FP8 at 1.3%) are reported but presented as acceptable rather than analyzed as failure cases. No qualitative error analysis is provided." 114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": false, 118 "justification": "Every experiment shows the system working as intended. No approaches that were tried and abandoned, no configurations that failed, and no ablations that hurt performance are reported." 119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "Abstract claims: 'reliably distinguishes benign and malicious executions' (supported by Tables 2-3, Figures 3-4), 'under 1% throughput overhead' (supported by Table 5: 0.3-1.0%). All abstract claims are backed by results." 126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": true, 130 "justification": "Causal claims (e.g., 'model substitution introduces systematic bias in logit outputs') are supported by formal proofs (Propositions 4.2-4.4, Appendix C) and controlled experiments that vary the deployment regime (benign vs malicious) while holding other factors constant." 
131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper bounds its empirical claims to 'dense and MoE models' and specifies the exact models and attacks tested. The title 'Practical LLM Auditing Framework' is broad, but the theoretical guarantees are general and the experiments are presented as validation on specific architectures, not universal proof." 136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": true, 140 "justification": "Appendix F analyzes adaptive adversary strategies and proves (Proposition F.1) that honest logit commitment is the dominant strategy. The paper considers whether fabricated logits could evade detection and formally rules this out under the rationality assumption." 141 }, 142 "proxy_outcome_distinction": { 143 "applies": true, 144 "answer": true, 145 "justification": "The paper explicitly defines LDD as a fidelity metric that captures 'approximation fidelity between a deployed execution and a claimed full-precision model' (Section 4.2). The proxy (LDD) is well-matched to the claim (detecting execution deviations), and Definition 4.1 formally specifies what is measured." 146 } 147 }, 148 "setup_transparency": { 149 "model_versions_specified": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 6.1 specifies models by name and size: LLaMA3-70B, Qwen3-32B, Qwen3-30B-A3B, DeepSeek-V2-Lite. For open-source models with fixed weights, these identifiers are sufficient to locate the exact model checkpoints." 153 }, 154 "prompts_provided": { 155 "applies": true, 156 "answer": false, 157 "justification": "Standard benchmarks (GSM8K, TriviaQA, WebQuestions) are named but the exact prompt format or template used to feed these to the models is not shown. The reader cannot reconstruct the exact prompts sent to vLLM." 
158 }, 159 "hyperparameters_reported": { 160 "applies": true, 161 "answer": false, 162 "justification": "Section 6.1 states 'Top-20 token sampling strategy' and tensor parallelism degree 2, but temperature, top-p, and max_tokens are not reported. These significantly affect model output and LDD distributions." 163 }, 164 "scaffolding_described": { 165 "applies": false, 166 "answer": false, 167 "justification": "No agentic scaffolding is used. IMMACULATE is a verification/auditing framework, not an agentic system." 168 }, 169 "data_preprocessing_documented": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section 6.1 documents: subsampling 500 prompts per dataset (1,500 total), random sampling of 200 from each for calibration setup, remaining held out for evaluation. The data pipeline from raw benchmarks to evaluation is traceable." 173 } 174 }, 175 "limitations_and_scope": { 176 "limitations_section_present": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper has no dedicated limitations, threats to validity, or discussion section addressing weaknesses. The conclusion (Section 7) is two paragraphs focused entirely on contributions and future work." 180 }, 181 "threats_to_validity_specific": { 182 "applies": true, 183 "answer": false, 184 "justification": "No specific threats to validity of the empirical evaluation are discussed. The threat model (Section 3) defines system assumptions but does not discuss limitations of the evaluation methodology." 
185 }, 186 "scope_boundaries_stated": { 187 "applies": true, 188 "answer": true, 189 "justification": "Section 3 explicitly states scope boundaries: 'We assume Srv is rational and economically motivated,' 'A malicious server is assumed to misbehave on at least 10% of queries,' 'We exclude attacks that increase computational cost and manage to serve users with an alternative model.'" 190 } 191 }, 192 "data_integrity": { 193 "raw_data_available": { 194 "applies": true, 195 "answer": false, 196 "justification": "Code is released but no raw experimental data (LDD distributions, per-request statistics, detection logs) is made available for independent verification. Only aggregated results in tables and figures are shown." 197 }, 198 "data_collection_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section 6.1 describes data sources (GSM8K, TriviaQA, WebQuestions), subsampling procedure (500 per dataset), hardware (RTX 6000 Pro), software framework (vLLM), and precision formats (BF16, FP8)." 202 }, 203 "recruitment_methods_described": { 204 "applies": false, 205 "answer": false, 206 "justification": "No human participants. Data comes from standard public benchmarks." 207 }, 208 "data_pipeline_documented": { 209 "applies": true, 210 "answer": true, 211 "justification": "The pipeline is documented: benchmark prompts → subsample → split into setup (200/dataset) and evaluation sets → run inference under different regimes → compute LDD → apply detection rule → report results. Each transformation step is described." 212 } 213 }, 214 "conflicts_of_interest": { 215 "funding_disclosed": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding or acknowledgments section is present in the paper. Funding sources are not disclosed." 
219 }, 220 "affiliations_disclosed": { 221 "applies": true, 222 "answer": true, 223 "justification": "Author affiliations are clearly listed: National University of Singapore, Nanyang Technological University, Independent Researcher, University of California Berkeley." 224 }, 225 "funder_independent_of_outcome": { 226 "applies": true, 227 "answer": false, 228 "justification": "No funding is disclosed, so independence of the funder cannot be assessed. Absence of disclosure is not absence of conflict." 229 }, 230 "financial_interests_declared": { 231 "applies": true, 232 "answer": false, 233 "justification": "No competing interests statement or financial interest declaration appears in the paper." 234 } 235 }, 236 "contamination": { 237 "training_cutoff_stated": { 238 "applies": false, 239 "answer": false, 240 "justification": "The paper evaluates an auditing framework (a defense/tool), not a pre-trained model's capability on benchmarks. The benchmarks serve as input sources for generating inference traffic; model knowledge of benchmark answers is irrelevant to the evaluation." 241 }, 242 "train_test_overlap_discussed": { 243 "applies": false, 244 "answer": false, 245 "justification": "Same as above: the paper tests a tool/defense rather than model knowledge. Train/test overlap does not affect the LDD measurement." 246 }, 247 "benchmark_contamination_addressed": { 248 "applies": false, 249 "answer": false, 250 "justification": "Same as above: benchmark contamination is irrelevant when the evaluation measures auditing detection rates, not model performance on benchmark tasks." 251 } 252 }, 253 "human_studies": { 254 "pre_registered": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "irb_or_ethics_approval": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 
263 }, 264 "demographics_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "inclusion_exclusion_criteria": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "randomization_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 }, 279 "blinding_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study." 283 }, 284 "attrition_reported": { 285 "applies": false, 286 "answer": false, 287 "justification": "No human participants in this study." 288 } 289 }, 290 "cost_and_practicality": { 291 "inference_cost_reported": { 292 "applies": true, 293 "answer": true, 294 "justification": "Table 5 reports throughput overhead (0.3–1.0%) and per-request VC cost relative to GPU inference (400–900×). Section 6.5 explains that the amortized overhead is negligible since audited requests are <10⁻⁵ of total traffic." 295 }, 296 "compute_budget_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "Hardware is specified (NVIDIA RTX 6000 Pro, 96GB, tensor parallelism degree 2) but total GPU hours, wall-clock time for the experiments, or total compute budget is not stated." 300 } 301 }, 302 "experimental_rigor": { 303 "seed_sensitivity_reported": { 304 "applies": true, 305 "answer": false, 306 "justification": "No results across multiple random seeds are reported. All results appear to be from single runs." 307 }, 308 "number_of_runs_stated": { 309 "applies": true, 310 "answer": false, 311 "justification": "The number of experimental runs is not stated. Results are presented without indicating how many runs produced them." 
312 }, 313 "hyperparameter_search_budget": { 314 "applies": true, 315 "answer": false, 316 "justification": "Section 6.4 explores different hyperparameter settings for t1 and t2 (Table 4), but the total search budget (how many configurations were tried, what search method was used) is not reported." 317 }, 318 "best_config_selection_justified": { 319 "applies": true, 320 "answer": true, 321 "justification": "Appendix E describes the ceremony for selecting t1 and t2: the model provider runs both deployed and quantized models on a prescribed dataset within a TEE, then selects the (t1, t2) pair yielding the lowest false positive rate while maintaining ≥5% detection rate." 322 }, 323 "multiple_comparison_correction": { 324 "applies": false, 325 "answer": false, 326 "justification": "The paper does not perform formal statistical hypothesis tests with p-values. Detection and false positive rates are reported as empirical proportions, not as statistical test outcomes requiring multiple comparison correction." 327 }, 328 "self_comparison_bias_addressed": { 329 "applies": true, 330 "answer": false, 331 "justification": "The authors evaluate their own IMMACULATE system against their own implementations of attacks and baselines. No discussion of author-evaluation bias or independent evaluation is present." 332 }, 333 "compute_budget_vs_performance": { 334 "applies": true, 335 "answer": false, 336 "justification": "Table 5 reports relative overhead (400–900× for VC vs inference) but performance is not reported as a function of compute budget. No performance curves across compute levels are shown." 337 }, 338 "benchmark_construct_validity": { 339 "applies": true, 340 "answer": false, 341 "justification": "The paper uses GSM8K, TriviaQA, and WebQuestions as input sources but does not discuss whether these benchmarks are representative of real-world LLM API traffic or whether the LDD distributions would generalize to production query distributions." 
342 }, 343 "scaffold_confound_addressed": { 344 "applies": false, 345 "answer": false, 346 "justification": "No scaffolding is involved. IMMACULATE is a verification framework, not an agentic system." 347 } 348 }, 349 "data_leakage": { 350 "temporal_leakage_addressed": { 351 "applies": false, 352 "answer": false, 353 "justification": "The paper evaluates an auditing tool, not model knowledge on benchmarks. Temporal leakage is irrelevant to whether LDD can detect execution deviations." 354 }, 355 "feature_leakage_addressed": { 356 "applies": false, 357 "answer": false, 358 "justification": "Same as above: the evaluation measures auditing framework performance, not model capability. Feature leakage concepts do not apply." 359 }, 360 "non_independence_addressed": { 361 "applies": false, 362 "answer": false, 363 "justification": "Same as above: train/test independence is irrelevant when the evaluation tests a defense/tool rather than model knowledge." 364 }, 365 "leakage_detection_method": { 366 "applies": false, 367 "answer": false, 368 "justification": "Same as above: data leakage detection is not applicable to an auditing framework evaluation." 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "LDD reliably distinguishes benign BF16 execution from FP8 quantization and model substitution across dense and MoE architectures.", 375 "evidence": "Table 2 shows tail probability for TV distance >0.1: BF16 0.017% vs FP8 1.7% vs substitution 11% for LLaMA3-70B. Similar separation across all four models. Figure 4 visualizes well-separated distributions (Section 6.2).", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Per-request detection rates are sufficient for effective randomized auditing, with false positive rates below 10⁻⁵.", 380 "evidence": "Table 3 shows per-request detection rates from 1.3% (Qwen3 FP8) to 99% (LLaMA3 substitution on WebQuestions). 
False positive rates estimated via extreme value theory at <10⁻⁵ (Section 6.3).", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "IMMACULATE imposes under 1% throughput overhead on benign servers.", 385 "evidence": "Table 5: throughput loss of 0.3% for LLaMA3-70B and Qwen3-32B, 0.9% for Qwen3-30B-A3B, 1.0% for DeepSeek-V2-Lite (Section 6.5).", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Token overreporting reduces to a special case of model substitution under the hybrid computation model.", 390 "evidence": "Proposition 4.4, proved in Appendix C.2, constructs an alternative model M' with dummy recurrent steps that produces the same output but inflated token count, showing this is functionally a model substitution.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "An α=0.1 dishonest server is detected with overwhelming probability over a month of auditing ~3,000 requests per day.", 395 "evidence": "Section 6.3 derives: daily detection probability ≥0.3, so monthly evasion probability becomes negligible. Relies on per-request detection probability ≥10⁻³ and false positive rate ≤10⁻⁵.", 396 "supported": "moderate" 397 }, 398 { 399 "claim": "A rational adversary's dominant strategy is to honestly commit runtime logits.", 400 "evidence": "Proposition F.1 (Appendix F) proves that fabricating logits closer to the reference than the actual execution would require a better model approximation within the same compute budget, contradicting optimality of the deployed model.", 401 "supported": "strong" 402 } 403 ], 404 "red_flags": [ 405 { 406 "flag": "No uncertainty quantification", 407 "detail": "All results are point estimates with no error bars, confidence intervals, standard deviations, or multi-run statistics. Given that LDD distributions are stochastic (dependent on GPU non-determinism and sampling), single-run results may not be stable." 
408 }, 409 { 410 "flag": "No limitations section", 411 "detail": "The paper lacks any discussion of limitations or threats to the validity of the empirical evaluation. Known limitations (single hardware platform, limited attack space, small evaluation scale vs claimed 'billions of requests/day' target) go unacknowledged." 412 }, 413 { 414 "flag": "Limited attack evaluation", 415 "detail": "Only two attack types are empirically tested: FP8 quantization and model substitution (smaller model). Token overreporting is dismissed via theoretical reduction. More sophisticated attacks (subtle quality degradation, selective cheating on specific query types, partial quantization) are not evaluated." 416 }, 417 { 418 "flag": "False positive rate estimated, not measured", 419 "detail": "Section 6.3 acknowledges: 'Accurately estimating the per-request false positive rate under benign execution is challenging due to the extreme rarity of tail events.' The FP rate is estimated via extreme value theory rather than directly measured, introducing model-dependency in a key metric." 420 }, 421 { 422 "flag": "Small evaluation scale", 423 "detail": "Only 1,500 prompts in total (500 per dataset) are used, and 200 per dataset of these are set aside for threshold calibration rather than evaluation, while the framework targets production services processing 'billions of requests per day.' The gap between evaluation scale and claimed deployment scale is substantial." 424 } 425 ], 426 "cited_papers": [ 427 { 428 "title": "Language models are few-shot learners", 429 "authors": ["T. Brown", "B. Mann", "N. Ryder"], 430 "year": 2020, 431 "relevance": "Foundational GPT-3 paper establishing the paradigm of large-scale LLM API services that IMMACULATE audits." 432 }, 433 { 434 "title": "Evaluating large language models trained on code", 435 "authors": ["M. Chen"], 436 "year": 2021, 437 "arxiv_id": "2107.03374", 438 "relevance": "Codex/HumanEval paper relevant to LLM code generation capability evaluation." 
439 }, 440 { 441 "title": "Chain-of-thought prompting elicits reasoning in large language models", 442 "authors": ["J. Wei", "X. Wang", "D. Schuurmans"], 443 "year": 2022, 444 "relevance": "Foundational prompting technique paper; reasoning token billing is a key motivation for IMMACULATE's token overreporting detection." 445 }, 446 { 447 "title": "Are you getting what you pay for? Auditing model substitution in LLM APIs", 448 "authors": ["W. Cai", "T. Shi", "X. Zhao", "D. Song"], 449 "year": 2025, 450 "arxiv_id": "2504.04715", 451 "relevance": "Primary prior work on LLM API auditing using TEEs; key baseline comparison for IMMACULATE." 452 }, 453 { 454 "title": "COIN: Counting the invisible reasoning tokens in commercial opaque LLM APIs", 455 "authors": ["G. Sun", "Z. Wang", "B. Tian"], 456 "year": 2025, 457 "arxiv_id": "2505.13778", 458 "relevance": "Addresses token overreporting in LLM APIs via embedding-based verification; complementary approach to IMMACULATE." 459 }, 460 { 461 "title": "FrugalGPT: How to use large language models while reducing cost and improving performance", 462 "authors": ["L. Chen", "M. Zaharia", "J. Zou"], 463 "year": 2023, 464 "arxiv_id": "2305.05176", 465 "relevance": "Addresses cost optimization for LLM API usage; motivates the economic incentives IMMACULATE audits against." 466 }, 467 { 468 "title": "Efficient memory management for large language model serving with PagedAttention", 469 "authors": ["W. Kwon", "Z. Li", "S. Zhuang"], 470 "year": 2023, 471 "relevance": "vLLM paper; the inference framework IMMACULATE is built on top of." 472 }, 473 { 474 "title": "How is ChatGPT's behavior changing over time?", 475 "authors": ["L. Chen", "M. Zaharia", "J. Zou"], 476 "year": 2024, 477 "relevance": "Empirical evidence of LLM API behavior drift over time, motivating the need for auditing frameworks." 478 }, 479 { 480 "title": "Model equality testing: Which model is this API serving?", 481 "authors": ["I. Gao", "P. Liang", "C. 
Guestrin"], 482 "year": 2024, 483 "arxiv_id": "2410.20247", 484 "relevance": "Statistical approach to identifying which model an API serves; empirical auditing baseline for IMMACULATE." 485 }, 486 { 487 "title": "zkLLM: Zero knowledge proofs for large language models", 488 "authors": ["H. Sun", "J. Li", "H. Zhang"], 489 "year": 2024, 490 "relevance": "Cryptographic verification of LLM inference; key prior work showing the overhead limitations that IMMACULATE addresses." 491 }, 492 { 493 "title": "Evaluation and benchmarking of LLM agents: A survey", 494 "authors": ["M. Mohammadi", "Y. Li", "J. Lo", "W. Yip"], 495 "year": 2025, 496 "relevance": "Survey of LLM agent evaluation methods relevant to the survey scope of AI agent methodology." 497 }, 498 { 499 "title": "Is your LLM overcharging you? Tokenization, transparency, and incentives", 500 "authors": ["A. A. Velasco", "S. Tsirtsis", "N. Okati", "M. Gomez-Rodriguez"], 501 "year": 2025, 502 "arxiv_id": "2505.21627", 503 "relevance": "Economic analysis of LLM token billing incentive misalignment; complementary economic perspective to IMMACULATE's technical approach." 504 } 505 ] 506 }