scan.json (28397B)
1 { 2 "paper": { 3 "title": "RAudit: A Blind Auditing Protocol for Large Language Model Reasoning", 4 "authors": ["Edward Y. Chang", "Longling Geng"], 5 "year": 2026, 6 "venue": "arXiv", 7 "arxiv_id": "2601.23133", 8 "doi": "10.48550/arXiv.2601.23133" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval", "theoretical"], 13 "key_findings": "RAudit is a diagnostic protocol for auditing LLM reasoning without ground truth, using CRIT-based reasonableness scores and PID control theory. Experiments on CAP-GSM8K and CausalL2 identify four failure mechanisms: Latent Competence Suppression (models derive correct answers then overwrite under social pressure), the False Competence Trap (weaker judges mask sycophancy), the Complexity-Vulnerability Tradeoff (causal tasks induce >10x higher sycophancy than math), and Iatrogenic Critique (authoritative correction harms weaker models). Llama 3.3 70B exceeded its clean accuracy after blind audit (96.2%→96.6%), while GPT-3.5 suffered 14.8% paranoia increase under authoritative framing with zero accuracy gain.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "The CausalT5k/CausalL2 benchmark and CAP-GSM8K dataset are described but no download link or release URL is provided. The paper references 'our large-scale diagnostic benchmark' without releasing it." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No environment specifications, dependency files, or library versions are provided." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No reproduction instructions or scripts are provided. The algorithm is specified in Appendix E (Algorithm 1), but there are no runnable reproduction steps." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": true, 41 "justification": "Table 3 reports 95% CIs for all accuracy values (e.g., '92.4±2.3'). Table 5 also reports 95% CIs. The paper states 'binomial normal approximation per condition (n=500)' for CI computation." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": true, 46 "justification": "Table 5 notes 'Bold indicates statistically significant lift (p < 0.05)'. The paper performs significance testing on the lift results." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Effect sizes are reported as percentage point differences with context: e.g., 'Sycophancy Gap' (−4.0% to −8.8%), 'RAudit Lift' (+2.6% to +6.4%), paranoia rate changes (+14.8%). Baseline and post-intervention values are provided." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "N=500 for CAP-GSM8K and N=1,000 for CausalL2 are stated but no justification or power analysis is provided for these choices." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "95% CIs are reported across conditions in Tables 3 and 5, which implicitly convey variance. The CIs are computed via binomial normal approximation." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper compares Clean (no hints), Base (adversarial hints, no audit), Polite audit, and Strong audit conditions. The unaudited single-pass inference serves as the baseline (§4.2)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Models evaluated include GPT-4o, Claude 3.5 Sonnet, Gemini 2.5 Flash, and Llama 3.3 70B — all contemporary at time of writing. GPT-5.2 is used as a frontier judge." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "RQ3 ablates judge strength (GPT-4o vs GPT-5.2) and RQ4 ablates critique tone (Polite vs Authoritative). Table 4 shows lift progression across protocol levels (Polite, Auth v1, Strong Causal v2)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Multiple metrics are used: accuracy, paranoia rate, realignment rate, sycophancy ratio, net effect, detection recall, dissonance rate, and paranoia tax (§4.2)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation of system outputs is performed. All evaluation is automated using LLM judges (GPT-4o and GPT-5.2). The paper acknowledges 'the auditor is itself an LLM and may inherit biases' but does not include human verification." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "No explicit train/test/validation split is described. CAP-GSM8K is derived from GSM8K and CausalL2 from CausalT5k, but there is no discussion of held-out evaluation sets or separation of tuning and test data for hyperparameters like ρ*=0.8." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down per model (Tables 3-7), per audit persona (Polite vs Strong), per judge (GPT-4o vs GPT-5.2), per domain (math vs causal), per failure mode (Table 20), and per trap type (Table 13)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Extensive failure case analysis: 12 case studies in Appendix D (6 resonance, 6 dissonance), 51 universally stubborn cases in Appendix I with detailed failure mode catalog, and the 'Structural Ceiling' limitation is thoroughly discussed." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Multiple negative results: Iatrogenic Critique (authoritative tone harms GPT-3.5, Table 7), GPT-4o's consistently negative lift on causal tasks (Table 4), the Paranoia Threshold, and the ~50% Dissonance Rate across all models." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims about four mechanisms (Latent Competence Suppression, False Competence Trap, Complexity-Vulnerability Tradeoff, Iatrogenic Critique) are all supported by specific experimental results in Tables 3-7 and §4.3-4.5." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The experimental design uses controlled manipulation: adversarial hint injection (Clean→Base) and audit interventions (Base→Final) with controlled conditions. The ablation design (varying one factor at a time: judge, tone, protocol level) supports causal claims about those factors." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "Claims are bounded to the tested models, datasets, and conditions. The Limitations section explicitly states 'the auditor is itself an LLM and may inherit biases' and acknowledges 51 stubborn cases as a ceiling. The paper scopes results to 'mathematical reasoning (CAP-GSM8K) and causal judgment (CausalL2)' specifically." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper discusses multiple alternative explanations: diagnostic misalignment by the auditor (Appendix F), the Declarative-Procedural Gap as an alternative to simple sycophancy, and acknowledges that 'process verification can detect inconsistency but cannot correct errors when the derivation itself is coherently biased' (§5)." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper explicitly distinguishes between its proxy (CRIT-based reasonableness score ρ) and the outcome (reasoning quality/correctness). It states 'the primary contribution is the measurement framework itself, not guaranteed improvement' (§1) and acknowledges that process verification has limits distinct from outcome correctness." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Models are listed as 'GPT-3.5 Turbo', 'Llama 3.3 70B', 'GPT-4o', 'Claude 3.5 Sonnet', 'Gemini 2.5 Flash', 'GPT-5.2' — marketing names without snapshot dates or API version identifiers." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "Prompt content is described in natural language (e.g., 'Polite vs Authoritative personas', 'Structural Nudges naming trap type') and example audit feedback is shown in case studies, but the full actual prompt templates used are not provided." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "ρ*=0.8 and Tmax=5 are stated (§4.2), and PID gain conditions are described abstractly, but specific values for Kp, Ki, Kd, γβ, δs, δJS, ε, µ, and LLM API parameters (temperature, top-p) are not reported." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "The RAudit scaffolding is described in detail: Algorithm 1 (Appendix E) specifies the full control loop, Figures 1-3 show the architecture for single-agent and multi-agent modes, the PID controller, quadrant-based intervention routing, and termination conditions are all formally specified." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "CAP-GSM8K is described as adversarial hint injection into GSM8K (N=500 per model, hints h≠y*). CausalL2 is described as the L2 subset of CausalT5k (N=1,000), with trap types listed and pressure protocol specified (§4.2)." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "A dedicated 'Limitations and the Structural Ceiling' subsection appears in §5, plus the Impact Statement discusses deployment risks, evaluation methodology limits, and broader limitations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Specific threats discussed: the auditor inherits LLM biases, 51 specific cases resisted all intervention, diagnostic misalignment by the auditor (Appendix F shows auditor precision drops to 14.5-33.3% for certain trap types), and the 'Structural Ceiling' where coherently biased traces fool process verification." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "The paper explicitly states what it does NOT show: 'process verification can detect trace-output inconsistency but cannot correct errors when the trace itself is coherently wrong' (§5), 'Adversarial attacks targeting the auditor remain unexplored' (Impact Statement), and 51 cases where the approach fundamentally fails are cataloged." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw experimental data (model traces, CRIT scores, per-case results) is released. Only aggregate statistics are reported in tables." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Data collection is described: CAP-GSM8K is built from GSM8K with adversarial hint injection (§4.2), CausalL2 is a subset of CausalT5k with specific trap types listed, and the evaluation protocol (5 states for RQ1, behavioral transitions for RQ3-4) is detailed." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data sources are standard benchmarks (GSM8K) and a constructed benchmark (CausalT5k)." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline from raw GSM8K → hint injection → base response → audit → refinement is documented with transition counts (Table 8: 5,546 T→T, 235 F→F, 120 T→F, 95 F→T out of 5,996 trials)." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information or acknowledgments section is present in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Authors are listed as affiliated with Stanford University (§1 header)." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding information is disclosed, so independence cannot be assessed." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff dates are stated for any of the five models evaluated." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "GSM8K (published 2021) is widely known to be in training data of modern LLMs, but the paper does not discuss whether models may have memorized GSM8K solutions. CausalT5k is novel, but no contamination analysis is performed." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "GSM8K was published in 2021 and is likely in the training data of all models tested (GPT-4o, Claude 3.5, etc.). The paper does not address this contamination risk at all, which is particularly problematic for a sycophancy study where memorized answers could affect the baseline." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "The paper mentions GPT-5.2 is '~17× more expensive' than GPT-4o (§4.2) but does not report actual API costs, tokens consumed, or cost per example for the RAudit protocol." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total computational budget is stated. The paper runs 5 models × multiple conditions × thousands of examples but does not quantify total API spend or compute hours." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of random seed sensitivity or results across multiple seeds. LLM outputs are stochastic but seed variation is not analyzed." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The paper does not state whether each condition was run multiple times or is a single-pass evaluation. Sample sizes (N=500, N=1000) refer to number of problems, not number of runs." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "ρ*=0.8 and Tmax=5 are stated but no justification for these values is given. No hyperparameter search budget or sensitivity analysis is reported." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The paper presents results for the 'Strong Causal' protocol as the best configuration but does not describe how this protocol was developed or selected. PID gain values are not reported." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "The paper compares 5 models across multiple conditions and metrics with significance tests but does not mention any multiple comparison correction (Bonferroni, etc.)." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors evaluate their own RAudit system against their own baselines and their own CausalT5k benchmark without acknowledging author-evaluation bias." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "RAudit adds multiple audit rounds (avg 1.54-1.98 rounds per Table 19) with additional LLM calls, but performance is not plotted as a function of compute. The paper notes GPT-5.2 is 17x more expensive but does not analyze cost-performance tradeoffs." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": true, 331 "justification": "Table 2 explicitly compares CausalT5k against prior benchmarks (CLadder, CRASS, CORR2CAUSE, e-CARE, TruthfulQA) on dimensions of construct validity (Levels, Traps, Pressure, Two-Axis evaluation). Section A.1 discusses why existing benchmarks are insufficient." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": true, 335 "answer": false, 336 "justification": "RAudit itself is the scaffold being tested, but the paper does not control for scaffold effects when comparing model performance. Different models may respond differently to the PID control mechanism, confounding model capability with scaffold interaction." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "GSM8K (2021) predates all models tested. The adversarial hints modify the problems, but the base solutions may be memorized. No temporal leakage analysis is provided." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "The adversarial hint injection itself introduces information about the ground truth (h≠y*), but the paper does not discuss whether this creates feature leakage concerns beyond the intended sycophancy pressure." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether CausalL2 vignettes share structural templates that could allow pattern-matching rather than genuine causal reasoning." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No concrete leakage detection method is applied. No canary strings, membership inference, or decontamination for GSM8K." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Adversarial user pressure erodes 4.0-8.8% accuracy across all model tiers on mathematical reasoning (CAP-GSM8K)", 365 "evidence": "Table 3 shows Sycophancy Gap from Clean to Base: GPT-3.5 −8.8%, Llama 3.3 −6.0%, Gemini −4.2%, GPT-4 −4.2%, GPT-4o −5.6%, Claude −4.0% (all N=500 with 95% CIs)", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Blind audit (without ground truth) recovers suppressed reasoning, with Llama 3.3 70B exceeding its clean capability (96.2%→96.6%)", 370 "evidence": "Table 3: all models show positive RAudit Lift (+2.6% to +6.4%) under Strong audit. Llama exceeds Clean accuracy by +0.4%.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "Causal tasks induce >10× higher sycophancy than mathematical tasks (30.1% vs 2.1% bad-flip rate)", 375 "evidence": "Stated in §4.4 RQ2 Summary as cross-domain comparison. Individual model bad-flip rates not broken out per domain in a single table.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Weaker judges mask sycophancy that stronger judges expose (False Competence Trap): Claude shifts from Q1-Discerning to Q4-Sycophantic when switching from GPT-4o to GPT-5.2 judge", 380 "evidence": "Tables 6 and 14-15: Claude paranoia nearly doubles (17.1%→33.3%) and sycophancy ratio triples (0.49→1.55) under GPT-5.2. Net effect reverses from +30 to −11 (Table 16).", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Authoritative critique increases GPT-3.5's paranoia by 14.8% with zero accuracy gain (Iatrogenic Critique)", 385 "evidence": "Table 7/18: GPT-3.5 ΔPara=+14.8%, ΔReal=+0.0%, ΔNet=−16 under authoritative vs polite critique.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "RAudit provides formal guarantees of bounded correction (Proposition 1) and O(log(1/ε)) termination (Proposition 2)", 390 "evidence": "Proofs in Appendix B. Bounded correction requires gain condition Kp + TmaxKi + 2Kd < 1/γβ. Termination requires contraction E[JS(t+1)|F(t)] ≤ κ·JS(t).", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "~50% Dissonance Rate across all models: detection does not guarantee correction", 395 "evidence": "Table 5: Dissonance Rates 47.9%-55.2% across 5 models under Strong Causal protocol (N=1,000 per model with 95% CIs).", 396 "supported": "strong" 397 } 398 ], 399 "red_flags": [ 400 { 401 "flag": "Benchmark contamination unaddressed", 402 "detail": "GSM8K (2021) is extremely likely to be in the training data of all models tested (2024-2026 vintage). The paper studies sycophancy under adversarial hints but does not discuss whether memorized solutions affect the Clean baseline or the model's susceptibility to hints. A model that has memorized '42' may behave differently under a hint of '50' than a model genuinely computing." 403 }, 404 { 405 "flag": "Self-evaluation of own benchmark and system", 406 "detail": "The authors propose RAudit and CausalT5k, then evaluate RAudit using CausalT5k. No independent evaluation or external validation is included. The benchmarks, protocol, and evaluation are all from the same team." 407 }, 408 { 409 "flag": "Missing PID hyperparameters", 410 "detail": "The PID controller is a core theoretical contribution, but specific gain values (Kp, Ki, Kd, γβ) used in experiments are never reported. This makes the theoretical guarantees unverifiable against the actual experimental configuration." 411 }, 412 { 413 "flag": "No cost accounting for multi-round LLM calls", 414 "detail": "RAudit adds 1.5-2 additional LLM rounds per example on average (Table 19), and the stronger GPT-5.2 judge is 17x more expensive. The practical cost of deploying RAudit is never quantified, undermining claims about real-world applicability." 415 }, 416 { 417 "flag": "Single-run results without seed analysis", 418 "detail": "LLM outputs are stochastic, but the paper does not report results across multiple seeds or runs. The CIs are based on binomial approximation over problems, not over experimental repetitions, so they do not capture run-to-run variance." 419 } 420 ], 421 "cited_papers": [ 422 { 423 "title": "Towards understanding sycophancy in language models", 424 "authors": ["M. Sharma"], 425 "year": 2024, 426 "relevance": "Core prior work on sycophancy as RLHF side-effect, directly motivates the sycophancy detection aspect of RAudit." 427 }, 428 { 429 "title": "Language models don't always say what they think: Unfaithful explanations in chain-of-thought prompting", 430 "authors": ["M. Turpin", "J. Michael", "E. Perez", "S. R. Bowman"], 431 "year": 2023, 432 "arxiv_id": "2305.04388", 433 "relevance": "Documents unfaithful CoT explanations in LLMs, directly relevant to trace-output inconsistency detection." 434 }, 435 { 436 "title": "Large language models cannot self-correct reasoning yet", 437 "authors": ["J. Huang"], 438 "year": 2024, 439 "relevance": "Shows LLMs fail to self-correct without ground truth, motivating RAudit's external auditor approach." 440 }, 441 { 442 "title": "Self-consistency improves chain of thought reasoning in language models", 443 "authors": ["X. Wang", "J. Wei", "D. Schuurmans"], 444 "year": 2023, 445 "relevance": "Self-consistency as a reasoning improvement baseline that RAudit extends with process verification." 446 }, 447 { 448 "title": "LLM evaluators recognize and favor their own generations", 449 "authors": ["A. Panickssery", "S. R. Bowman", "S. Feng"], 450 "year": 2024, 451 "relevance": "Documents self-preference bias in LLM-as-judge, directly relevant to RAudit's cross-family evaluator design." 452 }, 453 { 454 "title": "Inverse scaling: When bigger isn't better", 455 "authors": ["I. R. McKenzie"], 456 "year": 2023, 457 "arxiv_id": "2306.09479", 458 "relevance": "Documents the scaling paradox where larger models are more susceptible to certain failures, supporting RAudit's findings." 459 }, 460 { 461 "title": "TruthfulQA: Measuring how models mimic human falsehoods", 462 "authors": ["S. Lin", "J. Hilton", "O. Evans"], 463 "year": 2022, 464 "relevance": "Benchmark for measuring model truthfulness under social pressure, related to sycophancy evaluation." 465 }, 466 { 467 "title": "Constitutional AI: Harmlessness from AI feedback", 468 "authors": ["Y. Bai"], 469 "year": 2022, 470 "arxiv_id": "2212.08073", 471 "relevance": "Training-time alignment approach whose downstream iatrogenic effects RAudit reveals at inference time." 472 }, 473 { 474 "title": "Self-Refine: Iterative refinement with self-feedback", 475 "authors": ["A. Madaan", "N. Tandon", "P. Gupta"], 476 "year": 2023, 477 "relevance": "Iterative self-improvement baseline that RAudit extends with external auditor and formal guarantees." 478 }, 479 { 480 "title": "Why do multi-agent LLM systems fail?", 481 "authors": ["M. Cemri"], 482 "year": 2025, 483 "arxiv_id": "2503.13657", 484 "relevance": "Documents failure modes in multi-agent LLM systems including majority-opinion collapse, motivating RAudit's asymmetric evaluation design." 485 }, 486 { 487 "title": "BrokenMath: A benchmark for sycophancy in theorem proving with LLMs", 488 "authors": ["I. Petrov", "J. Dekoninck", "M. Vechev"], 489 "year": 2025, 490 "arxiv_id": "2510.04721", 491 "relevance": "Sycophancy benchmark for mathematical reasoning, directly related to RAudit's CAP-GSM8K evaluation." 492 }, 493 { 494 "title": "Scaling LLM test-time compute optimally can be more effective than scaling model parameters", 495 "authors": ["C. Snell", "J. Lee", "K. Xu", "A. Kumar"], 496 "year": 2024, 497 "relevance": "Foundational work on inference-time scaling that RAudit aims to regulate and audit." 498 } 499 ] 500 }