scan.json (28604B)
1 { 2 "paper": { 3 "title": "HybridFlow: Resource-Adaptive Subtask Routing for Efficient Edge-Cloud LLM Inference", 4 "authors": [ 5 "Jiangwen Dong", 6 "Jiayu Li", 7 "Tianhang Zheng", 8 "Wanyu Lin" 9 ], 10 "year": 2025, 11 "venue": "arXiv", 12 "arxiv_id": "2512.22137" 13 }, 14 "scan_version": 2, 15 "active_modules": ["experimental_rigor", "data_leakage"], 16 "methodology_tags": ["benchmark-eval"], 17 "key_findings": "HybridFlow decomposes complex queries into dependency-aware DAGs and routes subtasks between edge and cloud models using a learned utility predictor with adaptive budget-aware thresholds. On GPQA, MMLU-Pro, AIME24, and LiveBench-Reasoning, it achieves 55.34% average accuracy (competitive with cloud-only CoT at 58.99%) while reducing latency by 28.5% and API cost by 31.3% versus the HybridLLM collaborative baseline (17.48 s vs 24.45 s; CAPI 0.0088 vs 0.0128); the margins over DoT (18.32 s, CAPI 0.009) are much smaller. The adaptive router substantially outperforms any fixed-threshold policy (utility 0.7940 vs peak 0.6329), and a model-pair swap experiment suggests transferability beyond the primary Llama3.2-3B/GPT-4.1 configuration.", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The abstract states 'Code: HybridFlow' which appears to be a hyperlinked repository reference. A code artifact is indicated as available." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "All evaluation benchmarks (GPQA, MMLU-Pro, AIME24, LiveBench-Reasoning) are standard public datasets. Router training uses MMLU-Pro and Math500, also public." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "The paper mentions 'single NVIDIA RTX 3090 GPU' and model names but provides no requirements.txt, Dockerfile, or detailed library version listing sufficient to recreate the environment." 
34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions are provided in the paper or appendix. The implementation details section describes the system but does not include commands or scripts to replicate experiments." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": true, 45 "justification": "Tables 1 and 2 report mean ± std for all metrics across all benchmarks (e.g., '53.33±2.03' for GPQA accuracy)." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "The paper claims HybridFlow outperforms baselines based on comparing point estimates with std. No statistical significance tests (t-tests, bootstrap tests, etc.) are performed despite multiple comparative claims." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "Results are reported with absolute values and baseline context throughout, e.g., 'HybridFlow achieves an average Ctime of 17.48 s, outperforming HybridLLM (24.45 s)' and accuracy comparisons with both numbers provided." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "No justification for benchmark sizes or number of profiling queries (2,000). No power analysis or discussion of whether sample sizes are adequate for the claims." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": true, 65 "justification": "Standard deviation is reported across runs in Tables 1 and 2 (e.g., '53.33±2.03', '15.24±0.30')." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "Six baselines are included: Direct Prompt, CoT, SoT, PASTA, HybridLLM, and DoT. Both single-model and collaborative paradigms are represented." 
73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "Baselines include PASTA (2025), DoT (2025), HybridLLM (2024), and SoT (2024). These are recent and representative of the state of the art." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "Section 4.3 provides ablation studies: Edge-only, Cloud-only, Random routing, Fixed Threshold, and HybridFlow-Chain (disabling DAG parallelism). Table 3 isolates routing vs scheduling contributions." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "Four metrics are used: Accuracy, Ctime (latency), CAPI (API cost), and Utility (unified benefit-cost ratio). Tables 1 and 2 report all four." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "No human evaluation is included. All evaluation is automated via benchmark answer matching. The paper makes claims about 'reasoning quality' but relies solely on automated accuracy metrics." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": true, 97 "justification": "The router is trained on '2,000 sampled queries drawn from two benchmarks: MMLU-Pro (different from the main test samples) and Math500' (Appendix C). Evaluation benchmarks include GPQA and AIME24 which are not in the training set." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Results are broken down per benchmark (GPQA, MMLU-Pro, AIME24, LiveBench-Reasoning) in Tables 1 and 2, and per subtask position in Figure 3." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": false, 107 "justification": "No failure case analysis is provided. Table 5 shows 9-10% of queries fall back to chain plans, but the impact on accuracy for these cases is not discussed. No qualitative error analysis." 
108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "Table 3 shows HybridFlow-Chain achieves lower accuracy (50.62%) and utility (0.6095) than full HybridFlow (53.33%, 0.7940), and Table 6 shows how aggressive thresholds degrade performance. These demonstrate configurations that don't work well." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims HybridFlow 'improves the cost-accuracy trade-off, reducing latency and cloud API usage while maintaining competitive accuracy.' Tables 1-2 confirm: 55.34% avg accuracy with 17.48s latency and 0.0088 CAPI, outperforming collaborative baselines on all three dimensions." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "Causal claims like 'parallel execution substantially reduces end-to-end latency' and 'Router is crucial for achieving a balance' are supported by controlled ablations in Table 3 (e.g., HybridFlow-Chain removes DAG parallelism; Edge/Cloud/Random remove router). Single-variable manipulation in ablations is adequate." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The title claims 'Edge-Cloud LLM Inference' broadly, but results are primarily on one edge/cloud model pair (Llama3.2-3B/GPT-4.1) across four reasoning benchmarks. The conclusion states 'the promise of our adaptive, parallel-aware routing framework for orchestrating efficient edge-cloud AI' without bounding to the tested setting. One model-swap experiment (Appendix D.2) is acknowledged as limited." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "No discussion of alternative explanations for the results. 
For example, gains could be partially due to the specific model pair characteristics, benchmark properties, or the planner's decomposition style. No robustness checks beyond the single model-swap experiment." 135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper frames benchmark accuracy as 'reasoning quality' (e.g., 'preserving reasoning quality' in abstract, 'reasoning quality and system efficiency' in Sec 4.1) without discussing the gap between automated benchmark correctness and actual reasoning quality." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": false, 146 "justification": "Models are named as 'Llama3.2-3B', 'GPT-4.1', 'qwen3-embedding-0.6b', 'Qwen2.5-7B', 'DeepSeek-V3'. GPT-4.1 lacks a snapshot date or API version. Per schema requirements, marketing names without version identifiers do not count." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": true, 151 "justification": "The full planner prompt is provided in Figure 6 with complete text, including XML format constraints, examples, and EAG structure. Figure 7 shows a complete case study with actual subtask descriptions and outputs." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": true, 156 "justification": "Key hyperparameters are reported: temperature 0.6, AdamW learning rate 1e-4, τ0=0.2, Kmax=0.02, Lmax=20, ε=10^-4, nmax=7, Rmax=2, normalization scales 10 (latency) and 0.02 (API cost)." 157 }, 158 "scaffolding_described": { 159 "applies": true, 160 "answer": true, 161 "justification": "The entire scaffolding is described in detail: DAG decomposition pipeline, validation and repair procedure, scheduler queue, router mechanism with utility prediction and adaptive thresholding, and online calibration via contextual bandits. Algorithm 1 formalizes the full procedure." 
162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "Appendix C documents the profiling data pipeline: 2,000 queries from MMLU-Pro and Math500, paired edge/cloud executions, cached outputs recombined into mixed routing vectors, marginal effect estimation by toggling subtask assignments. Normalization constants and utility target computation are specified." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": false, 173 "justification": "No dedicated limitations section exists. The conclusion makes no mention of limitations. Appendix D.2 briefly notes 'broader coverage of model families is left for future work' but this is not a substantive limitations discussion." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": false, 178 "justification": "No threats to validity are discussed anywhere in the paper. There is no analysis of specific threats such as planner failure rates affecting results, model-pair specificity, or benchmark selection bias." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show (e.g., not tested on generation tasks, not tested with models of other scales, not tested under real network variability)." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": false, 190 "justification": "No raw data is available. Per-query results, profiling data, router training data, and detailed execution logs are not released." 
191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Appendix C describes profiling data collection: 2,000 queries from MMLU-Pro and Math500, paired execution on edge and cloud, evaluation via task-specific verifiers, and marginal correctness computation via reuse-and-recombine strategy." 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants. All data comes from standard public benchmarks." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": true, 205 "justification": "Appendix C documents the full pipeline: query decomposition → paired edge/cloud execution → cached output recombination → marginal effect estimation → utility target computation → router training with MSE loss. Normalization and clipping procedures are specified." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": false, 212 "justification": "No funding information is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants or sponsors." 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Author affiliations are clearly listed: Hong Kong Polytechnic University (Departments of Data Science and AI, and Computing) and Zhejiang University. None are affiliated with the evaluated models' companies." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": false, 222 "justification": "No funding source is disclosed, so independence of funder cannot be assessed. The paper evaluates GPT-4.1 (OpenAI) and Llama3.2-3B (Meta) — without a funding statement, potential conflicts remain unknown." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial interests statement is present in the paper." 
228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": true, 233 "answer": false, 234 "justification": "No training data cutoff dates are stated for GPT-4.1, Llama3.2-3B, or any model used. This is relevant since GPQA and MMLU-Pro could be in the training data." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": true, 238 "answer": false, 239 "justification": "No discussion of whether GPT-4.1 or Llama3.2-3B were trained on GPQA, MMLU-Pro, AIME24, or LiveBench-Reasoning data." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": true, 243 "answer": false, 244 "justification": "GPQA, MMLU-Pro, and AIME24 were all published before GPT-4.1's likely training cutoff. LiveBench is designed to mitigate contamination. No contamination discussion is provided for any benchmark." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 
282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": true, 288 "justification": "Tables 1 and 2 report per-query latency (Ctime in seconds) and API cost (CAPI in dollars) for all methods across all benchmarks. Cost is a central evaluation metric." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "The paper mentions 'single NVIDIA RTX 3090 GPU' for edge computation but does not state total GPU hours, training time for the router, or total API spend for profiling the 2,000 training queries." 294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": false, 300 "justification": "Tables report mean ± std but do not explicitly discuss random seed sensitivity. It is unclear whether variance comes from different seeds, sampling randomness at temperature 0.6, or other sources." 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": false, 305 "justification": "Results are reported as 'mean ± std' (Table 1 caption) but the exact number of runs is never stated." 306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "Hyperparameters are described as 'empirically set ... based on preliminary tuning across all benchmarks' (Appendix C) but the number of configurations tried or compute spent on tuning is not reported." 311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": false, 315 "justification": "While Table 6 shows a sweep of fixed thresholds and the optimization framework motivates adaptive thresholding, the final configuration (τ0=0.2, Kmax=0.02, Lmax=20) is 'empirically set' without explaining the selection criterion or validation procedure." 
316 }, 317 "multiple_comparison_correction": { 318 "applies": true, 319 "answer": false, 320 "justification": "Multiple methods are compared across multiple benchmarks with no correction for multiple comparisons and no significance tests at all." 321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "The authors implement and evaluate their own system against their reimplementations of baselines. No acknowledgment of potential author-evaluation bias per Lucic et al. (2018)." 326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": true, 330 "justification": "Performance is explicitly plotted as a function of compute budget. Figure 4 shows accuracy, cost, and utility across threshold values. Table 6 provides detailed cost-performance data. The Utility metric (Eq. 2) directly captures performance per unit cost." 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": false, 335 "justification": "No discussion of whether GPQA, MMLU-Pro, AIME24, or LiveBench-Reasoning actually measure the 'reasoning quality' and 'edge-cloud inference efficiency' the paper claims to evaluate. Benchmarks are used without questioning construct validity." 336 }, 337 "scaffold_confound_addressed": { 338 "applies": true, 339 "answer": false, 340 "justification": "Main comparisons mix scaffolding and routing differences: HybridFlow (DAG + adaptive routing) vs DoT (sequential + routing) vs HybridLLM (query-level routing). The scaffold effect is not isolated from the routing effect in the main results. The ablation (Table 3, HybridFlow-Chain) partially addresses this but the main cross-method comparisons do not." 341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of temporal leakage. GPT-4.1 may have been trained on data that includes solutions to GPQA, MMLU-Pro, or AIME24 problems." 
348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the evaluation setup leaks information. The DAG decomposition by the planner could provide structural hints that wouldn't be available in standard inference, but this is not discussed." 353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "The router training uses MMLU-Pro samples 'different from the main test samples', which shows some awareness, but no formal independence verification is reported. No discussion of LLM training data overlap with benchmarks." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": false, 362 "justification": "No concrete leakage detection or prevention methods are used (no canary strings, membership inference, n-gram overlap analysis, or decontamination)." 363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "HybridFlow achieves competitive accuracy (55.34% avg) while reducing latency and API cost compared to collaborative baselines.", 369 "evidence": "Table 1 shows 55.34% avg accuracy vs DoT 46.50% and HybridLLM 38.70%. Table 2 shows Ctime 17.48s vs HybridLLM 24.45s and DoT 18.32s; CAPI 0.0088 vs HybridLLM 0.0128 and DoT 0.009.", 370 "supported": "strong" 371 }, 372 { 373 "claim": "Dependency-aware DAG planning enables parallelism that substantially reduces end-to-end latency.", 374 "evidence": "Table 3 ablation: HybridFlow achieves 15.24s latency and 0.7940 utility vs HybridFlow-Chain (no DAG parallelism) at 16.12s and 0.6095 utility on GPQA. Latency improvement from DAG parallelism is modest (~0.88s).", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "The adaptive routing mechanism outperforms any fixed-threshold policy.", 379 "evidence": "Adaptive routing reaches a utility of 0.7940 (Table 3), whereas the best fixed threshold (τ0=0.6) peaks at a utility of 0.6329 (Table 6). 
Table 6 provides a comprehensive threshold sweep from τ0=0 to τ0=1.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "HybridFlow's design generalizes beyond the Llama3.2-3B/GPT-4.1 model pair.", 384 "evidence": "Table 8 shows one model-swap experiment (Qwen2.5-7B/DeepSeek-V3) on GPQA only. HybridFlow maintains advantages (53% accuracy, $1.16e-3 API cost) vs DoT (49%, $1.80e-3) under the swapped pair.", 385 "supported": "weak" 386 }, 387 { 388 "claim": "HybridFlow's router concentrates cloud usage on early, high-impact subtasks and shifts to edge for later subtasks.", 389 "evidence": "Figure 3 shows position-dependent offloading pattern on GPQA with cloud calls decreasing and adaptive threshold increasing at later subtask positions.", 390 "supported": "moderate" 391 } 392 ], 393 "red_flags": [ 394 { 395 "flag": "No limitations section", 396 "detail": "The paper contains no limitations, threats-to-validity, or scope-boundaries discussion. For a systems paper making broad claims about edge-cloud LLM inference, the absence of any limitations acknowledgment is a significant omission." 397 }, 398 { 399 "flag": "Single primary model pair", 400 "detail": "Main results use only Llama3.2-3B/GPT-4.1. The single model-swap experiment (Qwen2.5-7B/DeepSeek-V3) is limited to GPQA only. Claims about general edge-cloud inference are weakly supported by this narrow evaluation." 401 }, 402 { 403 "flag": "No statistical significance tests", 404 "detail": "Multiple comparative claims are made by comparing point estimates with standard deviations, but no significance tests are performed. Several results overlap in their error ranges (e.g., HybridFlow 53.33±2.03 vs HybridLLM 52.9±0.94 on GPQA)." 405 }, 406 { 407 "flag": "Planner failure rate not analyzed", 408 "detail": "Table 5 shows 9-10% of queries fall back to chain plans (sequential execution), yet the impact on accuracy and efficiency for these fallback cases is not reported separately. 
This could mask systematic failures on certain query types." 409 }, 410 { 411 "flag": "Number of experimental runs not stated", 412 "detail": "Results are reported as mean ± std but the number of runs is never specified. The reader cannot assess whether variance estimates are reliable." 413 }, 414 { 415 "flag": "Contamination risk unaddressed", 416 "detail": "GPT-4.1 may have been trained on GPQA, MMLU-Pro, and AIME24 benchmark data. Since HybridFlow routes subtasks to GPT-4.1, contamination could inflate accuracy results for cloud-routed subtasks, making the routing appear more effective than it is." 417 } 418 ], 419 "cited_papers": [ 420 { 421 "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing", 422 "authors": ["Dujian Ding", "Ankur Mallick", "Chi Wang", "Robert Sim", "Subhabrata Mukherjee", "Victor Rühle", "Laks V. S. Lakshmanan", "Ahmed H. Awadallah"], 423 "year": 2024, 424 "relevance": "Cost-efficient LLM query routing between small and large models, directly comparable baseline for edge-cloud collaboration." 425 }, 426 { 427 "title": "Division-of-Thoughts: Harnessing Hybrid Language Model Synergy for Efficient On-Device Agents", 428 "authors": ["Chenhan Shao", "Xiaohan Hu", "Yilu Lin", "Fangkai Xu"], 429 "year": 2025, 430 "relevance": "Hybrid language model collaboration for on-device agents with sequential reasoning delegation." 431 }, 432 { 433 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 434 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 435 "year": 2023, 436 "arxiv_id": "2305.05176", 437 "relevance": "Cost-aware LLM cascade strategies for budget-quality trade-offs in LLM inference." 438 }, 439 { 440 "title": "SplitReason: Learning to Offload Reasoning", 441 "authors": ["Yash Akhauri", "Andy Fei", "Chia-Chih Chang", "Amr F. AbouElhamayed", "Yejin Li", "Mohamed S. 
Abdelfattah"], 442 "year": 2025, 443 "arxiv_id": "2504.16379", 444 "relevance": "Fine-grained reasoning step offloading from edge to cloud models, closely related to subtask-level routing." 445 }, 446 { 447 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 448 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Fei Xia", "Ed Chi", "Quoc V. Le", "Denny Zhou"], 449 "year": 2022, 450 "relevance": "Foundational work on chain-of-thought reasoning that HybridFlow extends with parallel decomposition and routing." 451 }, 452 { 453 "title": "Skeleton-of-Thought: Prompting LLMs for Efficient Parallel Generation", 454 "authors": ["Xuefei Ning", "Zinan Lin", "Zixuan Zhou", "Zifu Wang", "Huazhong Yang", "Yu Wang"], 455 "year": 2024, 456 "relevance": "Parallel LLM generation via skeleton decomposition, one of the main baselines for parallel inference." 457 }, 458 { 459 "title": "Graph of Thoughts: Solving Elaborate Problems with Large Language Models", 460 "authors": ["Maciej Besta", "Nils Blach", "Ales Kubicek"], 461 "year": 2024, 462 "relevance": "Graph-based reasoning with LLMs enabling flexible dependency patterns, precursor to DAG-based decomposition." 463 }, 464 { 465 "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models", 466 "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"], 467 "year": 2023, 468 "relevance": "Tree-structured LLM reasoning with deliberate search, influential structured inference approach." 469 }, 470 { 471 "title": "GPQA: A Graduate-Level Google-Proof Q&A Benchmark", 472 "authors": ["David Rein", "Betty Li Hou", "Asa Cooper Stickland"], 473 "year": 2024, 474 "relevance": "Graduate-level reasoning benchmark used as primary evaluation dataset." 
475 }, 476 { 477 "title": "MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark", 478 "authors": ["Yubo Wang", "Xueguang Ma", "Ge Zhang"], 479 "year": 2024, 480 "arxiv_id": "2406.01574", 481 "relevance": "Enhanced multi-task benchmark for LLM evaluation, used as evaluation and router training data source." 482 }, 483 { 484 "title": "LiveBench: A Challenging, Contamination-Limited LLM Benchmark", 485 "authors": ["Colin White", "Samuel Dooley", "Manley Roberts"], 486 "year": 2025, 487 "relevance": "Contamination-resistant benchmark designed to provide clean LLM evaluation signals." 488 }, 489 { 490 "title": "S-DAG: A Subject-Based Directed Acyclic Graph for Multi-Agent Heterogeneous Reasoning", 491 "authors": ["Jiangwen Dong", "Zhiqi Lin", "Wanyu Lin", "Mingxuan Zhang"], 492 "year": 2025, 493 "arxiv_id": "2511.06727", 494 "relevance": "DAG-based multi-agent reasoning framework by same first author, precursor to HybridFlow's decomposition approach." 495 } 496 ] 497 }