scan.json (32774B)
1 { 2 "paper": { 3 "title": "ReLE: A Scalable System and Structured Benchmark for Diagnosing Capability Anisotropy in Chinese LLMs", 4 "authors": [ 5 "Rui Fang", 6 "Jian Li", 7 "Wei Chen", 8 "Bin Hu", 9 "Ying-Cong Chen", 10 "Xin Tang", 11 "Liang Diao" 12 ], 13 "year": 2026, 14 "venue": "arXiv.org", 15 "arxiv_id": "2601.17399", 16 "doi": "10.48550/arXiv.2601.17399" 17 }, 18 "scan_version": 2, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "Section 7.1 states 'We plan to release an anonymized subset of the failure case repository, along with the evaluation scripts and configuration files.' This is a promise of future release, not an actual release." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": false, 30 "justification": "The benchmark data (207,843 samples) is not released. Section 7.1 mentions planned release of scripts and a filtered failure case repository. The Domain-Specific Private Set (20%) is proprietary. No download links are provided." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "No requirements.txt, Dockerfile, or environment specifications are provided. The system architecture is described conceptually (Section 3) but without sufficient detail to recreate the evaluation environment." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions are provided. The paper describes the system architecture and methodology but does not include commands, scripts, or a README for reproducing the evaluation." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": true, 47 "justification": "Table 3 reports ±SD for all domain scores. Section 6.1 reports bootstrap 95% confidence intervals for RSA: [10.2, 12.6] for ReLE vs [4.1, 5.9] for baselines, from 1,000 bootstrap samples." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": true, 52 "justification": "Section 6.1 reports Kolmogorov-Smirnov test (D = 0.42, p < 10^-5) for RSA distribution differences. Bootstrap resampling with 1,000 iterations is used for confidence intervals. p-values reported (p < 0.001, p < 0.01)." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "Effect sizes are reported throughout: 70% cost reduction ($69K → $20.7K), RSA 11.4 vs ~5.0, ρ = 0.96 ranking correlation, Cohen's κ = 0.81 judge-human agreement, Anisotropy Index 0.74, Healthcare gap ~12 points." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "The paper evaluates 304 models across 207,843 samples but provides no power analysis or explicit justification for these sample sizes. The 50-model control experiment subset (Section 6.1.1) is also not justified." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": true, 67 "justification": "Table 3 reports ±SD for all domain scores across model categories. The Neyman allocation strategy explicitly estimates and uses stratum variance (S²_h,m). Bootstrap confidence intervals provide variance estimates for RSA." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Table 1 compares ReLE against GLUE, HELM, and OpenCompass. Section 6.1 compares RSA values against C-Eval and CLUE baselines under identical reweighting protocols." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "Baselines include OpenCompass (2024), LiveBench (2024), Arena-Hard (2024), Chatbot Arena (2024), and CompassJudger (2025). These are recent and representative of the state of the art." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": false, 84 "justification": "Table 4 provides a control experiment comparing dynamic sampling vs. full-set evaluation, but this validates one component. There is no systematic ablation of the system's core components (hybrid scoring stages, prompt schema adapter, variance-aware scheduler) to isolate their individual contributions." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Multiple metrics are used: Rank Stability Amplitude (RSA), Capability Inconsistency (CI), Anisotropy Index, Spearman correlation (ρ), Cohen's κ, Pearson r, end-to-end latency, and cost metrics." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": true, 94 "justification": "Section 3.2 reports Cohen's κ = 0.81 between GPT-4o judge and human annotators on 500 adversarial samples. Human expert review validates 10% of perturbed dataset samples (Section 4.1). Inter-annotator agreement ≥96.8% for prompt schema." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "Section 4.3 describes a Private Anchor Set (PAS) of 5,000 newly constructed samples 'strictly isolated from the internet.' The Domain-Specific Private Set (20%) serves as 'a strictly held-out test set.'" 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Table 3 provides per-domain breakdown across 7 core domains and 3 model categories. Table 2 shows the Domain × Capability matrix. Figure 2 shows per-dimension radar plots. Section 6.3 breaks down failure patterns by domain." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 6.3 presents systematic failure pattern analysis from 2.1M+ failure cases, identifying three patterns: Popular Benchmark Overfitting (41%), Domain-Dependent Gaps, and Size-Independent Gaps (18% of ≥20B models). Figure 4 visualizes failure distribution." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": false, 114 "justification": "The paper does not report approaches that were tried and abandoned, or configurations of their own system that failed. All design choices are presented as successful. The negative findings about evaluated models (overfitting, size-independent gaps) are about the subjects, not the method." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "Abstract claims of 70% cost reduction (supported by Section 3.3/6.4), ρ = 0.96 ranking correlation (Table 4), RSA 11.4 vs ~5.0 (Section 6.1), and 207,843 samples across 304 models (Section 4) are all substantiated in the paper body." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "The paper mostly uses appropriate language: correlational ('correlates more with instruction tuning, r = 0.65, than scale, r = 0.48'). The system-level causal claim (dynamic scheduling reduces cost) is validated with the control experiment in Table 4. Section 6.1.1 explicitly isolates sampling noise from anisotropy." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": true, 131 "justification": "The paper explicitly bounds scope to Chinese LLMs and 7 domains. Section 7.2 states 'emerging areas like Industrial IoT are not yet included.' Section 7.1 notes the architecture is 'language-agnostic' but is 'instantiated with Chinese-language benchmarks in this work.' The paper positions itself as diagnostic, not comprehensive." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": true, 136 "justification": "Section 6.1.1 explicitly tests whether RSA is an artifact of sampling noise vs genuine anisotropy (94.8% structural, 5.2% noise). Section 5.1 explains the 'Agent Score Anomaly' as format alignment rather than capability. Section 6.1 considers whether instability is a 'statistical artifact.'" 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper's central argument IS that aggregate scores are misleading proxies. Section 1.1 explicitly argues single-score rankings create 'an illusion of progress.' The RSA metric quantifies the gap between aggregate ranking (proxy) and structured capability assessment (outcome). Section 5.1 distinguishes 'latent capability and interface compliance.'" 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": false, 148 "justification": "The judge model is specified as 'GPT-4o-0513' (specific version), and secondary judges as 'Qwen-Max' and 'Claude-3.7-Sonnet.' However, the 304 evaluated models are referenced by marketing names (e.g., 'Llama 3', 'Gemini-3-Pro', 'DeepSeek-V2/V3') without specific version dates or API snapshots." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": false, 153 "justification": "Section 3.1 describes the Unified Prompt Schema conceptually (fields for Input Content, Output Format Requirements, Domain Tags, model-specific adapters) but does not provide actual prompt text. No appendix with prompts. Only natural-language descriptions of what prompts contain." 154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": false, 158 "justification": "Some system parameters are reported: scoring thresholds (0.92, 0.60), smoothing constant ε = 0.1, pilot set size 5%. However, key LLM inference parameters (temperature, top-p, max tokens) are not stated for either the evaluated models or the judge model." 159 }, 160 "scaffolding_described": { 161 "applies": false, 162 "answer": false, 163 "justification": "ReLE is a benchmark evaluation system, not an agentic scaffold. While it evaluates agent tasks, the system itself does not use agentic scaffolding (no tool use, retry logic, or memory management)." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 4.1 documents three data sources with percentages (45% fresh, 35% adapted, 20% private). Section 4.3 describes the multi-level decontamination pipeline: 13-gram overlap filtering, semantic deduplication via BGE-M3 (threshold > 0.85), symbolic solver validation (SymPy/WolframAlpha), and 10% human expert review." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section 7.2 'Limitations & Future Directions' is a dedicated subsection discussing specific limitations including domain coverage, mechanism understanding, and multi-modal evaluation." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": true, 180 "justification": "Section 7.2 identifies specific limitations: only 7 domains covered (IoT missing), multi-modal evaluation is limited, internal mechanisms causing anisotropy require further study. Section 6.1.1 tests whether RSA instability is a measurement artifact. Section 3.2 addresses judge bias specifically." 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": true, 185 "justification": "The paper explicitly states it covers Chinese LLMs across 7 domains (Section 7.2), positions ReLE as 'not a replacement for comprehensive static benchmarks' (Abstract/Section 1), and notes the framework is 'instantiated with Chinese-language benchmarks in this work' (Section 7.1)." 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": false, 192 "justification": "Section 8 mentions plans to 'open-source ReLE's failure case repository (2.1M instances) and evaluation infrastructure' but this is a future intention. No download links or supplementary data are currently provided. The private domain set (20%) will never be available." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "Section 4.1 describes three data sources: Dynamic Fresh Set (June 2024–Jan 2026), Solver-Verified Academic Refinement from Math24O etc., and Domain-Specific Private Set from industry partners. Section 4.3 describes real exam questions (2025 Gaokao), 56 domain experts for custom samples, and adapted academic datasets." 198 }, 199 "recruitment_methods_described": { 200 "applies": true, 201 "answer": false, 202 "justification": "56 domain experts are mentioned for creating custom domain samples (Section 4.3) but their recruitment, qualifications, and selection process are not described. Industry partners providing the private dataset (20%) are unnamed ('finance/medical')." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": true, 207 "justification": "The pipeline is documented: data sourcing (3 sources with percentages) → value perturbation for academic sets → symbolic solver validation → semantic deduplication (BGE-M3) → 13-gram overlap filtering → human expert review (10%) → Private Anchor Set isolation. Section 4.3 provides filtering criteria." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding section, acknowledgments, or grant information is provided. Authors are affiliated with commercial entities (Huawei, NSFOCUS Technologies, Ping An Insurance) but no funding disclosure is made." 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations are listed: Sun Yat-sen University, HKUST-GZ, NSFOCUS Technologies Co. Ltd, Huawei, and Ping An Property & Casualty Insurance Company of China. All affiliations appear on the first page." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "No funding is disclosed. Authors from Huawei and other companies may have products among the 304 evaluated models, creating a potential undisclosed conflict. Without funding disclosure, independence cannot be assessed." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests statement is present. Authors from Huawei and NSFOCUS (commercial AI/security companies) may have financial interests related to model evaluation rankings. Absence of disclosure is not absence of conflict." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": false, 236 "justification": "Section 4.1 mentions 'zero overlap with training data cutoffs of models released before mid-2025' for the fresh set, acknowledging cutoffs exist. However, specific training data cutoff dates for the 304 evaluated models are not stated." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": true, 241 "justification": "Section 4.3 extensively discusses train/test overlap: 13-gram overlap checks against CommonCrawl/C4, embedding-based semantic deduplication (cosine > 0.85 threshold against pre-October 2025 web snapshots), and Private Anchor Set (PAS) with Generalization Gap analysis (∆g > 15% flags overfitting)." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": true, 246 "justification": "Section 4.1 addresses contamination with a 45% Dynamic Fresh Set (June 2024–Jan 2026) ensuring zero overlap with pre-mid-2025 training data. Section 4.3 implements multi-level decontamination (n-gram + semantic). The Private Held-out Validation set provides contamination-resistant validation." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "The paper evaluates AI models, not human participants. The 56 domain experts are content creators for the benchmark, not study participants." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human subjects research is conducted. The study evaluates LLM capabilities on benchmark tasks." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in the study. Model characteristics (commercial vs. open-source, parameter scale) are reported instead." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants. Model inclusion criteria (304 models covering diverse architectures and scales) are described in Section 4.4." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants or experimental conditions assigned to humans." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants. The evaluation is automated." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants to experience attrition." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": true, 290 "justification": "Costs are a central focus: $69,000 for traditional full-set evaluation vs. $20,700 for ReLE (70% reduction). Average cost of $230 per model is stated (footnote 1, Section 1). Price-performance analysis (1-5 yuan models) in Section 5.2. Latency analysis in Section 6.4." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": true, 295 "justification": "Total evaluation cost is stated: $20,700 for 304 models with dynamic sampling vs. $69,000 for full-set evaluation. Section 6.4 discusses compute efficiency as a first-class evaluation dimension." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "No mention of random seed sensitivity analysis. Model evaluations appear to be single-pass. The bootstrap resampling (1,000 iterations) is for statistical testing of RSA, not for testing seed sensitivity of model evaluations." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": false, 307 "justification": "The number of evaluation runs per model is not explicitly stated. Bootstrap iterations (1,000) are stated for statistical tests but these are resampling analyses, not separate model evaluation runs." 308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": false, 312 "justification": "System parameters (scoring thresholds 0.92/0.60, ε = 0.1, pilot set 5%) appear manually set. No hyperparameter search budget, search method, or total compute spent on system parameter tuning is reported." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": false, 317 "justification": "The choice of system configuration (scoring thresholds, sampling parameters, weighting schemes) is not justified through systematic comparison of alternatives. The three weighting schemes (General, Professional, Reasoning) are presented without explaining why these specific schemes were chosen." 318 }, 319 "multiple_comparison_correction": { 320 "applies": true, 321 "answer": false, 322 "justification": "The paper compares 304 models across 22 dimensions and 7 domains but does not mention multiple comparison correction (Bonferroni, Holm, etc.). While some p-values are very small (p < 10^-5), the paper makes many comparisons without addressing family-wise error rate." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The authors present their own benchmark and evaluate models on it without acknowledging the potential bias of designing and scoring a benchmark to demonstrate specific findings (e.g., that RSA is higher in ReLE than baselines). No independent evaluation or bias mitigation discussed." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": true, 332 "justification": "Section 5.2 analyzes price-performance tradeoffs: '1-5 yuan models achieve comparable performance to high-priced models (≥5 yuan) in 8 out of 22 dimensions.' Section 6.4 reports latency analysis and non-linear cost-performance relationships. Section 3.3 reports cost savings explicitly." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": true, 337 "justification": "Construct validity is a central theme. The paper argues aggregate scores mask capability anisotropy (Section 1.1), introduces the Anisotropy Index to measure dimensional independence (Section 5.1), distinguishes 'latent capability and interface compliance' (Section 5.1), and uses the Generalization Gap to flag benchmark overfitting (Section 4.3)." 338 }, 339 "scaffold_confound_addressed": { 340 "applies": true, 341 "answer": true, 342 "justification": "Section 4.4 describes a Unified API-First Evaluation Protocol that standardizes the access layer across all 304 models. Section 3.1 describes a Model-Specific Adapter Layer ensuring 'performance differences reflect reasoning capability rather than instruction-following failures.' Section 5.1 explicitly discusses format alignment confounds." 343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": true, 349 "justification": "Section 4.1 addresses temporal leakage with a Dynamic Fresh Set (45%) of samples authored June 2024–Jan 2026, 'ensuring zero overlap with training data cutoffs of models released before mid-2025.' Temporal segmentation is a core design principle." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "The paper does not discuss whether the evaluation setup leaks answer information through context, prompt formatting, or task structure. Feature leakage (whether input features contain information unavailable at prediction time) is not addressed." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "The paper uses adapted academic datasets (35%) with value perturbation and symbolic solver validation, but does not explicitly discuss whether the adapted samples maintain independence from training data beyond surface-level perturbation. Structural similarities between original and perturbed samples are not analyzed." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": true, 364 "justification": "Section 4.3 applies concrete detection methods: 13-gram overlap checks against CommonCrawl/C4, embedding-based semantic deduplication (BGE-M3, cosine > 0.85 against pre-October 2025 web snapshots), and Private Anchor Set (PAS) with Generalization Gap computation (∆g > 15% flags overfitting)." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "ReLE's dynamic variance-aware scheduler reduces evaluation costs by 70% compared to full-set evaluation while maintaining ranking correlation of ρ = 0.96.", 371 "evidence": "Table 4 shows dynamic sampling yields RSA 11.4 vs full-set 10.8, with Spearman ρ = 0.96 (p < 10^-5). Cost: $20,700 vs $69,000 (Section 1.2, 3.3).", 372 "supported": "strong" 373 }, 374 { 375 "claim": "Models exhibit significantly higher ranking instability in ReLE (mean RSA 11.4) versus traditional benchmarks (RSA ~5.0), demonstrating capability anisotropy.", 376 "evidence": "Section 6.1: 65% of models show RSA ≥10, 23% show RSA ≥20. KS test D = 0.42, p < 10^-5. Bootstrap 95% CI: ReLE [10.2, 12.6] vs baselines [4.1, 5.9]. Control experiment (Table 4) shows 94.8% of instability is structural.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "LLM capability is highly anisotropic (Anisotropy Index = 0.74), meaning performance in one domain is a poor predictor of performance in another.", 381 "evidence": "Section 5.1 computes Ianiso from average Pearson correlation across 22 dimensions. Cross-dimensional correlations between professional and general domains are low (r = 0.26). Visualized in Figure 2 radar plots.", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "41% of evaluated models exhibit signs of overfitting to public benchmarks, scoring high on C-Eval (avg 73.2) but failing on ReLE's professional sub-tasks (avg 48.5).", 386 "evidence": "Section 6.3. However, the gap could reflect task difficulty differences rather than overfitting. No formal overfitting test (e.g., comparing generalization gap distributions) is provided beyond the aggregate score difference.", 387 "supported": "weak" 388 }, 389 { 390 "claim": "Agent-specialized models significantly outperform general-purpose commercial models in Tool Use (74.8 vs 62.4), driven by instruction tuning rather than parameter scale.", 391 "evidence": "Table 3 shows the score gap. Section 5.2 reports correlation with instruction tuning (r = 0.65) vs scale (r = 0.48). Section 5.1 attributes the gap to 'Format Alignment' with function-calling schemas.", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "Mid-priced commercial models (1-5 yuan) match expensive models (≥5 yuan) in 8 of 22 dimensions with average score difference ≤3.2%.", 396 "evidence": "Section 5.2 states this finding. However, detailed price-performance data (which 8 dimensions, specific model comparisons) is not shown in tables.", 397 "supported": "weak" 398 } 399 ], 400 "methodology_tags": ["benchmark-eval"], 401 "key_findings": "ReLE evaluates 304 Chinese LLMs across 207,843 samples in a Domain × Capability matrix, revealing significant capability anisotropy (Ianiso = 0.74) masked by aggregate scores. Rankings are highly unstable under different capability weightings (mean RSA 11.4 vs ~5.0 in traditional benchmarks, p < 10^-5), with 94.8% of instability attributable to structural capability differences rather than measurement noise. The system's variance-aware adaptive scheduler reduces evaluation costs by 70% while preserving ranking fidelity (ρ = 0.96). Notably, 41% of models show benchmark overfitting patterns and agent-specialized models outperform general models in tool use through instruction tuning rather than parameter scale.", 402 "red_flags": [ 403 { 404 "flag": "Undisclosed funding and potential conflicts", 405 "detail": "Authors are affiliated with Huawei, NSFOCUS Technologies, and Ping An Insurance — companies that may have models among the 304 evaluated. No funding disclosure, competing interests statement, or conflict-of-interest acknowledgment is provided." 406 }, 407 { 408 "flag": "Overfitting claim conflates task difficulty with contamination", 409 "detail": "Section 6.3 claims 41% of models show 'Popular Benchmark Overfitting' based on scoring higher on C-Eval (avg 73.2) than ReLE professional sub-tasks (avg 48.5). This gap could simply reflect that niche professional tasks are inherently harder than general benchmarks, not that models are overfitted. No formal overfitting test is provided." 410 }, 411 { 412 "flag": "No code, data, or artifact release", 413 "detail": "Despite evaluating 304 models across 207,843 samples with a claimed 2.1M+ failure case repository, nothing is released. All claims rely on trust in the authors' system. The 20% proprietary dataset from unnamed industry partners can never be verified." 414 }, 415 { 416 "flag": "Judge model evaluating itself", 417 "detail": "GPT-4o-0513 is used as the primary judge for semi-objective tasks (24% of scoring). GPT-4o variants are among the 304 evaluated models. While concordance with secondary judges (Pearson r = 0.88) is reported, the primary judge may still have systematic preferences for its own family's output patterns." 418 }, 419 { 420 "flag": "Selective reporting of price-performance data", 421 "detail": "The claim that '1-5 yuan models match expensive models in 8 of 22 dimensions' (Section 5.2) does not show which 8 dimensions, which specific models, or the full price-performance analysis. This makes the claim difficult to verify." 422 } 423 ], 424 "cited_papers": [ 425 { 426 "title": "Holistic evaluation of language models", 427 "authors": ["P. Liang", "R. Bommasani", "T. Lee"], 428 "year": 2022, 429 "relevance": "Foundational LLM evaluation framework (HELM); directly compared as baseline for ReLE's diagnostic approach." 430 }, 431 { 432 "title": "C-Eval: A comprehensive evaluation of Chinese language models", 433 "authors": ["Y. Huang", "C. Liang", "X. Du"], 434 "year": 2021, 435 "relevance": "Major Chinese LLM benchmark used as baseline for RSA comparison, demonstrating benchmark saturation." 436 }, 437 { 438 "title": "OpenCompass: A Universal Evaluation Platform for Foundation Models", 439 "authors": ["OpenCompass Contributors"], 440 "year": 2024, 441 "arxiv_id": "2307.06233", 442 "relevance": "Leading open-source Chinese LLM evaluation platform; directly compared against ReLE's design philosophy." 443 }, 444 { 445 "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference", 446 "authors": ["W.-L. Chiang", "L. Zheng", "Y. Sheng"], 447 "year": 2024, 448 "arxiv_id": "2403.04132", 449 "relevance": "Preference-based LLM evaluation framework; contrasted with ReLE's structured diagnostic approach." 450 }, 451 { 452 "title": "LiveBench: A Challenging, Contamination-Free LLM Benchmark", 453 "authors": ["C. White", "S. Dooley", "S. Manjunatha"], 454 "year": 2024, 455 "arxiv_id": "2406.19314", 456 "relevance": "Dynamic benchmark addressing contamination via continuous updates; compared with ReLE's adaptive approach." 457 }, 458 { 459 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 460 "authors": ["L. Chen", "M. Zaharia", "J. Zou"], 461 "year": 2023, 462 "relevance": "Cost-aware LLM usage framework; aligns with ReLE's efficiency-first evaluation philosophy." 463 }, 464 { 465 "title": "CompassJudger-1: All-in-one Judge Model Helps Model Evaluation and Evolution", 466 "authors": ["OpenCompass Team"], 467 "year": 2025, 468 "arxiv_id": "2410.16256", 469 "relevance": "Judge-model-based evaluation method; discussed as orthogonal to ReLE's structured evaluation approach." 470 }, 471 { 472 "title": "From Live Data to High-Quality Benchmarks: The Arena-Hard Pipeline", 473 "authors": ["T. Li", "W.-L. Chiang", "E. Frick"], 474 "year": 2024, 475 "arxiv_id": "2406.11939", 476 "relevance": "Dynamic benchmark pipeline from live data; compared with ReLE's adaptive evaluation approach." 477 }, 478 { 479 "title": "The Llama 3 Herd of Models", 480 "authors": ["AI @ Meta"], 481 "year": 2024, 482 "arxiv_id": "2407.21783", 483 "relevance": "Major open-source LLM family; cited as example of models that have saturated traditional benchmarks." 484 }, 485 { 486 "title": "Length-Controlled AlpacaEval: A Simple Way to Debias Automatic Evaluators", 487 "authors": ["Y. Dubois", "X. Li", "R. Taori"], 488 "year": 2024, 489 "arxiv_id": "2404.04475", 490 "relevance": "Addresses evaluator bias in LLM assessment, relevant to ReLE's judge bias mitigation approach." 491 }, 492 { 493 "title": "CMMLU: Measuring massive multitask language understanding in Chinese", 494 "authors": ["H. Li", "Y. Zhang", "F. Koto"], 495 "year": 2023, 496 "arxiv_id": "2306.09212", 497 "relevance": "Chinese-specific multitask LLM benchmark; part of the evaluation landscape ReLE addresses." 498 }, 499 { 500 "title": "AlignBench: Benchmarking Chinese Alignment", 501 "authors": ["Y. Liu", "Y. Duan", "Y. Zhang"], 502 "year": 2024, 503 "arxiv_id": "2311.18743", 504 "relevance": "Chinese LLM alignment benchmark; compared with ReLE's multi-dimensional decomposition approach." 505 } 506 ] 507 }