ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (19625B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "IMMACULATE: A Practical LLM Auditing Framework via Verifiable Computation",
      6     "authors": [
      7       "Yanpei Guo",
      8       "Wenjie Qu",
      9       "Linyu Wu",
     10       "Shengfang Zhai",
     11       "Lionel Z. Wang",
     12       "Ming Xu",
     13       "Yue Liu",
     14       "Binhang Yuan",
     15       "Dawn Song",
     16       "Jiaheng Zhang"
     17     ],
     18     "year": 2026,
     19     "venue": "arXiv",
     20     "arxiv_id": "2602.22700",
     21     "doi": null
     22   },
     23   "checklist": {
     24     "claims_and_evidence": {
     25       "abstract_claims_supported": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "All abstract claims are substantiated: <1% throughput overhead is confirmed in Table 5 (0.3%–1.0% across four models), detection of model substitution/quantization is shown in Tables 2–3, and no trusted hardware requirement is demonstrated through the TEE-free inference path.",
     29         "source": "haiku"
     30       },
     31       "causal_claims_justified": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper makes causal claims about LDD distinguishing benign from malicious execution, supported by controlled experiments running identical prompt sets under BF16 (benign), FP8 (quantization attack), and substitute model (substitution attack) conditions, with Propositions 4.2–4.4 formally proving the statistical signatures.",
     35         "source": "haiku"
     36       },
     37       "generalization_bounded": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper tests four models and three datasets (1500 total prompts) but draws broad claims about compatibility with 'existing large-scale LLM deployments' without explicitly bounding generalization to the tested architectures, precision formats, or task distributions.",
     41         "source": "haiku"
     42       },
     43       "alternative_explanations_discussed": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper proves that truthful logit commitment is optimal for rational adversaries (Appendix F), but does not discuss alternative explanations for LDD separation failure (e.g., models with unusual logit distributions, or quantization schemes designed to minimize TV distance).",
     47         "source": "haiku"
     48       },
     49       "proxy_outcome_distinction": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The paper clearly distinguishes the measured quantity (logit TV-distance distribution tail probability) from the claimed property (detection of economically motivated API deviations), with formal propositions connecting each attack type to a distinct LDD signature.",
     53         "source": "haiku"
     54       }
     55     },
     56     "limitations_and_scope": {
     57       "limitations_section_present": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "There is no dedicated limitations or threats-to-validity section; the conclusion only mentions future work directions without acknowledging weaknesses of the current approach.",
     61         "source": "haiku"
     62       },
     63       "threats_to_validity_specific": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "While the paper acknowledges that direct empirical FP estimation is infeasible (requiring EVT extrapolation) and that FP8 per-request detection is low, these are not framed as threats to validity and no systematic validity analysis is presented.",
     67         "source": "haiku"
     68       },
     69       "scope_boundaries_stated": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "The paper does not explicitly state what IMMACULATE does not detect (e.g., output quality degradation without measurable quantization, non-autoregressive architectures, prompt injection) or under what conditions the LDD approach would break down.",
     73         "source": "haiku"
     74       }
     75     },
     76     "conflicts_of_interest": {
     77       "funding_disclosed": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "No funding acknowledgment or grant disclosure appears anywhere in the paper.",
     81         "source": "haiku"
     82       },
     83       "affiliations_disclosed": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Author affiliations are clearly disclosed in the header: National University of Singapore, Nanyang Technological University, Independent Researcher, and UC Berkeley.",
     87         "source": "haiku"
     88       },
     89       "funder_independent_of_outcome": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "No funding is disclosed, so funder independence cannot be assessed.",
     93         "source": "haiku"
     94       },
     95       "financial_interests_declared": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No competing interests statement, patent declaration, or financial interests disclosure appears in the paper.",
     99         "source": "haiku"
    100       }
    101     },
    102     "scope_and_framing": {
    103       "key_terms_defined": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Key terms are precisely defined: verifiable computation (Section 2.2), the hybrid computation model and its components (Section 4.1), logit distance distribution (Definition 4.1), α-dishonest execution, and auditing properties (completeness, soundness, efficiency, privacy, generality) in Section 3.",
    107         "source": "haiku"
    108       },
    109       "intended_contribution_clear": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The introduction explicitly lists three contributions: (1) the IMMACULATE auditing framework, (2) the LDD metric enabling verification under numerical non-determinism, and (3) a prototype implementation on vLLM with <1% overhead and published code.",
    113         "source": "haiku"
    114       },
    115       "engagement_with_prior_work": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Section 2.3 and Appendix A systematically categorize prior auditing work into three paradigms (empirical, GPU-TEE, cryptographic), Table 1 shows IMMACULATE's advantage across all three dimensions, and individual prior works are analyzed for specific limitations IMMACULATE addresses.",
    119         "source": "haiku"
    120       }
    121     }
    122   },
    123   "type_checklist": {
    124     "benchmark-creation": {
    125       "construct_design": {
    126         "construct_validity_argued": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Propositions 4.2–4.4 with formal proofs (Appendix C) argue why LDD measures approximation fidelity: model substitution produces systematic logit bias, precision reduction produces increased variance, and token overreporting is formally reduced to model substitution.",
    130           "source": "haiku"
    131         },
    132         "difficulty_distribution_characterized": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "The paper evaluates only two attack severity levels (FP8 quantization and full model substitution) without characterizing a difficulty gradient or testing intermediate attack strengths (e.g., INT8, different model size gaps).",
    136           "source": "haiku"
    137         },
    138         "ceiling_floor_effects_checked": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Per-request FP8 detection rates as low as 1.3% represent a near-floor effect for quantization detection, but the paper does not acknowledge this as a ceiling/floor problem or calculate how many audited requests are needed to achieve reliable detection in practice.",
    142           "source": "haiku"
    143         },
    144         "human_baseline_included": {
    145           "applies": false,
    146           "answer": false,
    147           "justification": "IMMACULATE is an automated auditing framework for LLM API integrity, not a benchmark measuring human-level capabilities; a human performance baseline is not applicable.",
    148           "source": "haiku"
    149         },
    150         "scoring_rubric_justified": {
    151           "applies": true,
    152           "answer": true,
    153           "justification": "The threshold selection procedure (ceremony in Appendix E, parameter study in Table 4) is described and its robustness demonstrated across a range of hyperparameter values, with the FP/detection rate trade-off characterized across four models.",
    154           "source": "haiku"
    155         }
    156       },
    157       "robustness": {
    158         "contamination_resistance_designed": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Appendix F proves that truthful logit commitment is the dominant strategy for rational adversaries — strategic logit fabrication cannot outperform honest commitment because generating closer-to-ideal logits would require executing a better model, contradicting the cost-saving motivation.",
    162           "source": "haiku"
    163         },
    164         "temporal_robustness_discussed": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The paper does not discuss whether LDD-based detection remains effective as quantization techniques improve, or whether threshold ceremonies must be repeated as model families evolve — there is no plan for keeping the framework current.",
    168           "source": "haiku"
    169         },
    170         "failure_modes_discussed": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Some failure modes are implicitly present (low FP8 detection rates, EVT-based FP estimation) but the paper provides no systematic discussion of what attacks the framework cannot detect or conditions under which LDD separation would collapse.",
    174           "source": "haiku"
    175         },
    176         "baseline_implementations_provided": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "Code is published at https://github.com/guo-yanpei/Immaculate, built on vLLM and HuggingFace Transformers, with complete algorithmic pseudo-code for the server, auditor, and VC procedures in Appendices B–D.",
    180           "source": "haiku"
    181         }
    182       },
    183       "documentation": {
    184         "dataset_documentation_complete": {
    185           "applies": true,
    186           "answer": false,
    187           "justification": "The paper uses standard benchmarks (GSM8K, TriviaQA, WebQuestions) with 500 prompts each but provides no data card, preprocessing details, or documentation distinguishing the 200-prompt calibration sets from the evaluation sets beyond a one-sentence description.",
    188           "source": "haiku"
    189         },
    190         "licensing_and_access_clear": {
    191           "applies": true,
    192           "answer": false,
    193           "justification": "A GitHub URL is provided for the code but no license is mentioned in the paper, and the terms under which others can use or extend IMMACULATE are not specified.",
    194           "source": "haiku"
    195         },
    196         "intended_use_specified": {
    197           "applies": true,
    198           "answer": false,
    199           "justification": "Intended use is implied (auditing black-box LLM APIs for economically motivated deviations) but the paper does not specify what should NOT be concluded (e.g., that passing IMMACULATE audits guarantees output quality or absence of all forms of manipulation).",
    200           "source": "haiku"
    201         }
    202       }
    203     }
    204   },
    205   "claims": [
    206     {
    207       "claim": "IMMACULATE achieves under 1% throughput overhead for benign LLM servers",
    208       "evidence": "Table 5 shows 0.3% throughput loss for LLaMA3-70B and Qwen3-32B, 0.9% for Qwen3-30B-A3B, 1.0% for DeepSeek-V2-Lite",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "LDD reliably distinguishes benign BF16 execution from model substitution at the per-request level",
    213       "evidence": "Table 3 shows model substitution detection rates of 42–99% across models and datasets with estimated FP < 10^-5",
    214       "supported": "strong"
    215     },
    216     {
    217       "claim": "LDD can detect FP8 quantization attacks sufficient for effective randomized auditing",
    218       "evidence": "Table 3 shows per-request FP8 detection rates of 1.3–10.3%; the paper argues 1% per-request detection suffices for the randomized scheme, but the math uses α=0.1 dishonesty rate and requires thousands of audited requests",
    219       "supported": "moderate"
    220     },
    221     {
    222       "claim": "Token overreporting is formally equivalent to model substitution under the hybrid computation model",
    223       "evidence": "Proposition 4.4 with proof in Appendix C.2 constructs a modified model M' with dummy recurrent steps that preserve output sequence but inflate reported token count T by K",
    224       "supported": "strong"
    225     },
    226     {
    227       "claim": "False positive rate for benign servers is below 10^-5",
    228       "evidence": "EVT-based tail modeling applied because direct empirical estimation was infeasible given compute constraints; all observed benign requests were correctly classified, but this is not a direct measurement",
    229       "supported": "moderate"
    230     },
    231     {
    232       "claim": "A rational adversary's dominant strategy is to commit logits from the best affordable model approximation rather than fabricate logits",
    233       "evidence": "Proposition F.1 proves that producing L_fake closer to M* than M_opt would require computing a better approximation within the same budget, contradicting optimality of M_opt",
    234       "supported": "strong"
    235     }
    236   ],
    237   "methodology_tags": [
    238     "theoretical",
    239     "benchmark-eval"
    240   ],
    241   "key_findings": "IMMACULATE combines randomized auditing with a novel Logit Distance Distribution (LDD) metric to detect economically motivated LLM API deviations without trusted hardware, achieving <1% throughput overhead across four tested models. Model substitution attacks are detectable at 42–99% per-request rates, while FP8 quantization yields lower 1.3–10.3% rates — both with estimated false positives below 10^-5. Token overreporting is formally reduced to model substitution under a hybrid computation abstraction. The adaptive adversary analysis proves that rational servers cannot benefit from logit fabrication, closing the strategic evasion loophole.",
    242   "red_flags": [
    243     {
    244       "flag": "FP8 detection rates near floor",
    245       "detail": "Per-request FP8 detection rates of 1.3–10.3% are very low; the paper asserts this is sufficient given large-scale randomized auditing but does not calculate the expected time-to-detection for a server that serves 10% of its requests in FP8 (α=0.1) at these rates."
    246     },
    247     {
    248       "flag": "False positive rate not directly measured",
    249       "detail": "FP rate is estimated via Extreme Value Theory extrapolation because direct empirical measurement was infeasible under compute constraints — the claim of FP < 10^-5 is model-dependent and unvalidated against ground truth."
    250     },
    251     {
    252       "flag": "Token overreporting not empirically evaluated",
    253       "detail": "Token overreporting is excluded from all experiments with only a theoretical reduction to model substitution, leaving the framework's ability to detect real-world billing manipulation empirically unverified."
    254     },
    255     {
    256       "flag": "Threshold ceremony is heuristic and circular",
    257       "detail": "Section 6.4 acknowledges parameters are 'largely heuristic'; the ceremony requires the model provider to run both their deployed and reference models in a TEE, creating a dependency on honest provider cooperation during setup."
    258     },
    259     {
    260       "flag": "No limitations section",
    261       "detail": "No dedicated limitations or threats-to-validity section; the paper does not discuss attacks that could minimize LDD (e.g., quantization schemes tuned to preserve logit distributions) or failure modes for future model architectures."
    262     },
    263     {
    264       "flag": "Small evaluation sample",
    265       "detail": "Only 500 prompts per dataset (1500 total evaluation queries, 600 used for threshold calibration) — limited statistical basis for the extreme-tail probability estimates underpinning the FP < 10^-5 claim."
    266     }
    267   ],
    268   "cited_papers": [
    269     {
    270       "title": "Are you getting what you pay for? Auditing model substitution in LLM APIs (Cai et al., 2025)",
    271       "relevance": "Most closely related prior work; establishes the threat model IMMACULATE directly addresses and demonstrates GPU-TEE as a competing approach"
    272     },
    273     {
    274       "title": "zkLLM: Zero knowledge proofs for large language models (Sun et al., 2024)",
    275       "relevance": "Prior cryptographic verification approach whose integer-only arithmetic requirement IMMACULATE is designed to circumvent"
    276     },
    277     {
    278       "title": "zkGPT: An efficient non-interactive zero-knowledge proof framework for LLM inference (Qu et al., 2025)",
    279       "relevance": "Another ZKP-based LLM verification approach; context for cryptographic overhead IMMACULATE avoids"
    280     },
    281     {
    282       "title": "CoIn: Counting the invisible reasoning tokens in commercial opaque LLM APIs (Sun et al., 2025a)",
    283       "relevance": "Prior work on token overreporting detection via Merkle commitments; IMMACULATE provides a different mechanism and argues CoIn is bypassable"
    284     },
    285     {
    286       "title": "TopLoc: A locality sensitive hashing scheme for trustless verifiable inference (Ong et al., 2025)",
    287       "relevance": "Alternative verification approach for public models; motivates why proprietary model auditing requires a different approach"
    288     },
    289     {
    290       "title": "Model equality testing: Which model is this API serving? (Gao et al., 2024)",
    291       "relevance": "Statistical empirical approach to model identification; represents the competing class of approaches lacking formal guarantees"
    292     },
    293     {
    294       "title": "Is your LLM overcharging you? Tokenization, transparency, and incentives (Velasco et al., 2025)",
    295       "relevance": "Economic analysis of token overreporting incentives; complementary work addressing incentive misalignment rather than technical verification"
    296     },
    297     {
    298       "title": "Nondeterminism-aware optimistic verification for floating-point neural networks (Yao et al., 2025)",
    299       "relevance": "Prior work addressing GPU non-determinism in neural network verification; context for IMMACULATE's LDD approach to the same challenge"
    300     }
    301   ],
    302   "engagement_factors": {
    303     "practical_relevance": {
    304       "score": 3,
    305       "justification": "Directly addresses a concrete trust gap for every enterprise using commercial LLM APIs, with working code and <1% overhead that makes deployment viable today."
    306     },
    307     "surprise_contrarian": {
    308       "score": 2,
    309       "justification": "Challenges the assumption that auditing LLM execution requires either trusted hardware or prohibitive cryptographic overhead, demonstrating a practical middle path."
    310     },
    311     "fear_safety": {
    312       "score": 2,
    313       "justification": "Highlights that LLM providers have documented economic incentives to silently substitute cheaper models and overbill tokens, with real forum reports as evidence of the accountability gap."
    314     },
    315     "drama_conflict": {
    316       "score": 1,
    317       "justification": "References real forum complaints about GPT quality degradation, but the paper is technical rather than accusatory toward any specific provider."
    318     },
    319     "demo_ability": {
    320       "score": 2,
    321       "justification": "Working code is published on GitHub built on vLLM, but running it requires access to 70B-parameter models and multi-GPU infrastructure, limiting casual reproducibility."
    322     },
    323     "brand_recognition": {
    324       "score": 1,
    325       "justification": "Dawn Song (UC Berkeley, known for security research) adds some name recognition, but there is no major AI lab affiliation and the paper evaluates only open-source models."
    326     }
    327   },
    328   "hn_data": {
    329     "threads": [],
    330     "top_points": 0,
    331     "total_points": 0,
    332     "total_comments": 0
    333   }
    334 }

Impressum · Datenschutz