scan.json (28655B)
1 { 2 "paper": { 3 "title": "PrefillShare: A Shared Prefill Module for KV Reuse in Multi-LLM Disaggregated Serving", 4 "authors": [ 5 "Sunghyeon Woo", 6 "Hoseung Kim", 7 "Sunghwan Shim", 8 "Minjung Jo", 9 "Hyunjoon Jeong", 10 "Jeongtae Lee", 11 "Joonghoon Kim", 12 "Sungjae Lee", 13 "Baeseong Park", 14 "Se Jung Kwon", 15 "Dongsoo Lee" 16 ], 17 "year": 2026, 18 "venue": "arXiv", 19 "arxiv_id": "2602.12029" 20 }, 21 "scan_version": 2, 22 "active_modules": ["experimental_rigor", "data_leakage"], 23 "methodology_tags": ["benchmark-eval"], 24 "key_findings": "PrefillShare decouples LLM inference into a shared frozen prefill module and task-specific decode modules, enabling cross-model KV cache reuse in disaggregated serving. Cache-conditioned fine-tuning preserves accuracy within 1% of full fine-tuning across math, coding, and tool-calling benchmarks on LLaMA-3.1-8B and Qwen3-1.7B/8B/14B. In multi-model agent serving with ReAct and Reflexion workloads, PrefillShare achieves up to 4.5× lower p95 latency and 3.9× higher throughput by maintaining ~89% prefix cache hit ratio at high concurrency, compared to a disaggregated baseline where hit ratio degrades sharply beyond ~40 concurrent sessions.", 25 "checklist": { 26 "artifacts": { 27 "code_released": { 28 "applies": true, 29 "answer": false, 30 "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper. The system is described as built atop vLLM but no implementation is released." 31 }, 32 "data_released": { 33 "applies": true, 34 "answer": true, 35 "justification": "All training and evaluation datasets are publicly available: MetaMathQA-40K, EvolInstruct-Code-80K, xLAM-function-calling-60K, GSM8K, GSM+, HumanEval, HumanEval+, and BFCL." 
36 }, 37 "environment_specified": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper mentions A100-SXM4-80G GPUs, vLLM, and LMFlow framework, but provides no requirements.txt, Dockerfile, or detailed dependency list with library versions." 41 }, 42 "reproduction_instructions": { 43 "applies": true, 44 "answer": false, 45 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. Training hyperparameters are listed in Appendix A but there are no instructions for reproducing the serving experiments." 46 } 47 }, 48 "statistical_methodology": { 49 "confidence_intervals_or_error_bars": { 50 "applies": true, 51 "answer": false, 52 "justification": "All accuracy results in Tables 1 and 2 are reported as point estimates (e.g., '71.4', '49.3') with no confidence intervals or error bars. Serving performance figures show single curves without uncertainty bands." 53 }, 54 "significance_tests": { 55 "applies": true, 56 "answer": false, 57 "justification": "The paper claims PrefillShare 'achieves accuracy comparable to Full-FT' and reports up to 4.5× latency improvement, but no statistical significance tests are used to support these comparative claims." 58 }, 59 "effect_sizes_reported": { 60 "applies": true, 61 "answer": true, 62 "justification": "The paper reports relative improvements with baseline context: '4.5× lower p95 latency', '3.9× higher throughput', 'accuracy within 1% of Full-FT', and specific numbers for both systems (e.g., cache hit ratio 89% vs 60% peak for baseline)." 63 }, 64 "sample_size_justified": { 65 "applies": true, 66 "answer": false, 67 "justification": "No justification for dataset sizes (40K, 80K, 60K training examples) or why only one epoch of fine-tuning was used. No power analysis or sample size rationale for serving experiments." 
68 }, 69 "variance_reported": { 70 "applies": true, 71 "answer": false, 72 "justification": "No standard deviations, variance across seeds, or spread measures are reported for any result. All tables and figures show single-run numbers." 73 } 74 }, 75 "evaluation_design": { 76 "baselines_included": { 77 "applies": true, 78 "answer": true, 79 "justification": "For accuracy: compares against Full-FT and Inherent (base model) baselines. For serving: compares against a disaggregated serving baseline with the same GPU budget. Naive sharing is also shown in Figure 2." 80 }, 81 "baselines_contemporary": { 82 "applies": true, 83 "answer": true, 84 "justification": "Full fine-tuning is the standard and appropriate baseline for the accuracy claims. The disaggregated serving baseline reflects current practice. Related work discusses DroidSpeak and KVComm as contemporaries but does not directly compare against them." 85 }, 86 "ablation_study": { 87 "applies": true, 88 "answer": true, 89 "justification": "Figure 2 shows accuracy as a function of KV cache sharing ratio (0% to 100%), demonstrating the effect of cache-conditioned fine-tuning versus naive sharing. This isolates the contribution of the training procedure." 90 }, 91 "multiple_metrics": { 92 "applies": true, 93 "answer": true, 94 "justification": "Accuracy is evaluated across 6 benchmarks (GSM8K, GSM+, HumanEval, HumanEval+, BFCL Simple Python, BFCL Multiple). Serving performance uses p95 latency, throughput, TTFT, and prefix cache hit ratio." 95 }, 96 "human_evaluation": { 97 "applies": false, 98 "answer": false, 99 "justification": "Human evaluation is irrelevant to the claims. The accuracy claims are about matching scores on automated benchmarks (math solving, code generation, function calling). The serving claims are about latency and throughput metrics." 
100 }, 101 "held_out_test_set": { 102 "applies": true, 103 "answer": true, 104 "justification": "Training uses MetaMathQA, EvolInstruct-Code, and xLAM-function-calling datasets. Evaluation uses separate benchmarks: GSM8K/GSM+ for math, HumanEval/HumanEval+ for coding, BFCL for tool calling." 105 }, 106 "per_category_breakdown": { 107 "applies": true, 108 "answer": true, 109 "justification": "Table 1 breaks down by domain (math, coding, tool calling) and by model (LLaMA3.1-8B, Qwen3-8B). Table 2 breaks down by model scale (1.7B, 8B, 14B). Serving results are broken down by workload type (ReAct, Reflexion)." 110 }, 111 "failure_cases_discussed": { 112 "applies": true, 113 "answer": true, 114 "justification": "Figure 2 explicitly shows failure of naive sharing (accuracy collapse at high sharing ratios). Appendix B.2 discusses throughput degradation at extreme concurrency from KV staging/reload overhead. Figure 4 shows throughput saturation and decline." 115 }, 116 "negative_results_reported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The naive sharing approach is shown to collapse in Figure 2. Appendix B.2 acknowledges throughput decline at high concurrency due to handoff overhead. Figure 6 (Qwen3-14B) shows TTFT increases at higher arrival rates under PrefillShare." 120 } 121 }, 122 "claims_and_evidence": { 123 "abstract_claims_supported": { 124 "applies": true, 125 "answer": true, 126 "justification": "The abstract's claim that PrefillShare 'matches full fine-tuning accuracy on a broad range of tasks and models' is supported by Tables 1-2 (within 1% across 6 benchmarks, 2 model families, 3 scales). Its claim of '4.5× lower p95 latency and 3.9× higher throughput' is supported by Figure 3 (Reflexion workload at highest load)." 
127 }, 128 "causal_claims_justified": { 129 "applies": true, 130 "answer": true, 131 "justification": "Causal claims ('PrefillShare enables', 'cache-conditioned fine-tuning preserves accuracy') are supported by controlled experiments: ablation of sharing ratio in Figure 2, same-budget GPU comparison for serving. The design isolates the treatment (shared prefill + cache-conditioned FT) from the control (standard FT + independent serving)." 132 }, 133 "generalization_bounded": { 134 "applies": true, 135 "answer": true, 136 "justification": "Results are presented in terms of specific models tested (LLaMA3.1-8B, Qwen3 family), specific tasks (math, coding, tool calling), and specific serving scenarios (4-agent ReAct/Reflexion). The conclusion hedges: 'These results suggest that shared-prefill execution is a promising foundation.'" 137 }, 138 "alternative_explanations_discussed": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper briefly suggests that cache-conditioned fine-tuning 'can be interpreted as a form of strict regularization' to explain its occasional accuracy improvements, but it does not substantively discuss alternative explanations for the serving improvements or confounding factors." 142 }, 143 "proxy_outcome_distinction": { 144 "applies": true, 145 "answer": true, 146 "justification": "The paper measures accuracy on benchmarks and calls it accuracy, measures p95 latency and calls it latency, measures throughput and calls it throughput. There is no gap between what is measured and what is claimed." 147 } 148 }, 149 "setup_transparency": { 150 "model_versions_specified": { 151 "applies": true, 152 "answer": true, 153 "justification": "Specific model versions are stated: 'LLaMA3.1-8B' (referencing Grattafiori et al., 2024) and 'Qwen3-1.7B/8B/14B-Base' (referencing Yang et al., 2025). These are identifiable versions." 
154 }, 155 "prompts_provided": { 156 "applies": true, 157 "answer": false, 158 "justification": "The serving experiments use 'ReAct and Reflexion' prompting patterns but reference prior work (Kim et al., 2025; Woo et al., 2026) without reproducing the actual prompt text. No prompts are shown in the paper or appendix." 159 }, 160 "hyperparameters_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Appendix A reports: batch size 1, gradient accumulation 16, max sequence length 1024, 1 epoch, learning rate grid {1e-4, 1e-5, 2e-5, 5e-5, 2e-6, 5e-6}, AdamW with β1=0.9, β2=0.999, weight decay 0.1, warmup ratio 0.03." 164 }, 165 "scaffolding_described": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 3.3 describes the disaggregated inference workflow in detail: prefix-aware routing, the 3-step execution pipeline (shared/partial prefill → selective decode → cache handoff), and how the proxy maintains routing tables. Appendix B.1 provides implementation details." 169 }, 170 "data_preprocessing_documented": { 171 "applies": true, 172 "answer": true, 173 "justification": "Appendix A specifies datasets and training configurations. Appendix B.1 describes the serving evaluation setup: fixed input/output token lengths based on prior work statistics, session construction rules, and context growth protocol." 174 } 175 }, 176 "limitations_and_scope": { 177 "limitations_section_present": { 178 "applies": true, 179 "answer": false, 180 "justification": "There is no dedicated limitations section. The paper has no 'Limitations', 'Threats to Validity', or similar section. The conclusion is brief and does not substantively discuss limitations." 181 }, 182 "threats_to_validity_specific": { 183 "applies": true, 184 "answer": false, 185 "justification": "No specific threats to validity are discussed. The paper does not consider potential confounds, generalization limits, or specific methodological weaknesses." 
186 }, 187 "scope_boundaries_stated": { 188 "applies": true, 189 "answer": false, 190 "justification": "No explicit scope boundaries are stated. The paper does not delineate what the results do NOT show or which settings are excluded from the claims." 191 } 192 }, 193 "data_integrity": { 194 "raw_data_available": { 195 "applies": true, 196 "answer": false, 197 "justification": "No raw experimental data, logs, or intermediate results are released. Only aggregated results in tables and figures are provided." 198 }, 199 "data_collection_described": { 200 "applies": true, 201 "answer": true, 202 "justification": "Training data sources are clearly identified (MetaMathQA-40K, EvolInstruct-Code-80K, xLAM-function-calling-60K) with citations. Evaluation benchmarks are standard and well-documented. Serving workload generation is described in Appendix B.1." 203 }, 204 "recruitment_methods_described": { 205 "applies": false, 206 "answer": false, 207 "justification": "No human participants. All experiments use standard benchmarks and synthetic serving workloads." 208 }, 209 "data_pipeline_documented": { 210 "applies": true, 211 "answer": true, 212 "justification": "The pipeline from datasets through fine-tuning to evaluation is documented in Appendix A and B.1. The serving pipeline describes how sessions are created, how requests flow through proxy → prefill → decode, and how context grows across turns." 213 } 214 }, 215 "conflicts_of_interest": { 216 "funding_disclosed": { 217 "applies": true, 218 "answer": false, 219 "justification": "No funding or acknowledgments section is present in the paper. All authors are from NAVER Cloud but no funding source is disclosed." 220 }, 221 "affiliations_disclosed": { 222 "applies": true, 223 "answer": true, 224 "justification": "All authors are clearly listed as affiliated with NAVER Cloud in the author block at the top of the paper." 
225 }, 226 "funder_independent_of_outcome": { 227 "applies": true, 228 "answer": false, 229 "justification": "All authors are NAVER Cloud employees. NAVER Cloud has a direct commercial interest in LLM serving efficiency — the proposed system could reduce their own serving costs. The funder is not independent of the outcome." 230 }, 231 "financial_interests_declared": { 232 "applies": true, 233 "answer": false, 234 "justification": "No competing interests or financial interests statement is present. NAVER Cloud employees proposing a system that benefits NAVER Cloud's serving infrastructure have an undisclosed potential conflict." 235 } 236 }, 237 "contamination": { 238 "training_cutoff_stated": { 239 "applies": true, 240 "answer": false, 241 "justification": "No training data cutoff dates are stated for LLaMA 3.1 or Qwen3. The paper does not discuss when the base models' training data was collected." 242 }, 243 "train_test_overlap_discussed": { 244 "applies": true, 245 "answer": false, 246 "justification": "No discussion of whether GSM8K (2021), HumanEval (2021), or BFCL examples appeared in LLaMA 3.1 or Qwen3 training data. Since both methods use the same base model, contamination would affect both equally, but this is not discussed." 247 }, 248 "benchmark_contamination_addressed": { 249 "applies": true, 250 "answer": false, 251 "justification": "GSM8K and HumanEval were published in 2021, well before the 2024-2025 training cutoffs of LLaMA 3.1 and Qwen3. The models could have seen these problems during pre-training. This contamination risk is not addressed." 252 } 253 }, 254 "human_studies": { 255 "pre_registered": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study. All experiments are computational." 259 }, 260 "irb_or_ethics_approval": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 
264 }, 265 "demographics_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "inclusion_exclusion_criteria": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "randomization_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "blinding_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 }, 285 "attrition_reported": { 286 "applies": false, 287 "answer": false, 288 "justification": "No human participants in this study." 289 } 290 }, 291 "cost_and_practicality": { 292 "inference_cost_reported": { 293 "applies": true, 294 "answer": true, 295 "justification": "Serving performance is extensively reported: p95 end-to-end latency, throughput (tok/s), and TTFT across varying session arrival rates (Figures 3, 5) and concurrency levels (Figures 4, 6)." 296 }, 297 "compute_budget_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "Hardware is specified (8× A100-SXM4-80G for training; 8 GPUs for serving), but no total GPU hours, training time, or total compute cost is reported." 301 } 302 }, 303 "experimental_rigor": { 304 "seed_sensitivity_reported": { 305 "applies": true, 306 "answer": false, 307 "justification": "No mention of multiple random seeds or seed sensitivity analysis. All accuracy results in Tables 1-2 appear to be single-seed runs." 308 }, 309 "number_of_runs_stated": { 310 "applies": true, 311 "answer": false, 312 "justification": "The number of experimental runs is not stated for any result. It is unclear whether accuracy numbers are from a single run or averaged." 
313 }, 314 "hyperparameter_search_budget": { 315 "applies": true, 316 "answer": true, 317 "justification": "Appendix A reports: 'Learning rates were selected via a grid search over {1×10⁻⁴, 1×10⁻⁵, 2×10⁻⁵, 5×10⁻⁵, 2×10⁻⁶, 5×10⁻⁶} for each dataset.' Six configurations were searched per dataset." 318 }, 319 "best_config_selection_justified": { 320 "applies": true, 321 "answer": false, 322 "justification": "The learning rate grid search is described but the selection criterion is not stated. It is unclear whether the best configuration was selected on validation data or test data." 323 }, 324 "multiple_comparison_correction": { 325 "applies": false, 326 "answer": false, 327 "justification": "No statistical significance tests are performed, so correction for multiple comparisons is not applicable." 328 }, 329 "self_comparison_bias_addressed": { 330 "applies": true, 331 "answer": false, 332 "justification": "The authors compare their PrefillShare implementation against their own Full-FT and disaggregated baseline implementations without acknowledging potential author-evaluation bias." 333 }, 334 "compute_budget_vs_performance": { 335 "applies": true, 336 "answer": true, 337 "justification": "The serving comparison explicitly uses the same total GPU budget: 'PrefillShare uses the same total GPU budget (4 prefill GPUs and 4 decode GPUs)' vs baseline's '4 isolated prefill/decode pairs (8 GPUs total).' For training, both Full-FT and PrefillShare use the same hardware." 338 }, 339 "benchmark_construct_validity": { 340 "applies": true, 341 "answer": false, 342 "justification": "No discussion of whether GSM8K, HumanEval, or BFCL adequately measure the capabilities claimed (mathematical reasoning, coding ability, tool calling). The benchmarks are used without questioning construct validity." 
343 }, 344 "scaffold_confound_addressed": { 345 "applies": true, 346 "answer": true, 347 "justification": "The serving experiments hold the agentic scaffold (ReAct/Reflexion pattern) constant between PrefillShare and baseline. The accuracy experiments use identical evaluation protocols. The only variable is the serving/fine-tuning method." 348 } 349 }, 350 "data_leakage": { 351 "temporal_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "No discussion of temporal leakage. GSM8K (2021) and HumanEval (2021) were published years before LLaMA 3.1 and Qwen3 were trained, meaning solutions could be in the training data." 355 }, 356 "feature_leakage_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of whether the evaluation setup leaks information. The paper does not analyze whether any aspect of the benchmarks provides unintended hints." 360 }, 361 "non_independence_addressed": { 362 "applies": true, 363 "answer": false, 364 "justification": "No discussion of whether training data (MetaMathQA, EvolInstruct-Code) overlaps with or shares structure with evaluation benchmarks (GSM8K, HumanEval)." 365 }, 366 "leakage_detection_method": { 367 "applies": true, 368 "answer": false, 369 "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, decontamination, or overlap analysis is performed." 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "PrefillShare matches full fine-tuning accuracy within 1% across math, coding, and tool-calling benchmarks.", 376 "evidence": "Table 1 shows PrefillShare vs Full-FT on LLaMA3.1-8B: GSM8K 71.4 vs 71.3, GSM+ 49.3 vs 49.8, HumanEval 48.8 vs 48.2, HumanEval+ 45.1 vs 45.7, BFCL Simple 90.7 vs 90.0, BFCL Multiple 88.5 vs 88.0. 
Results are similar on Qwen3-8B-Base.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "PrefillShare scales across model sizes (1.7B to 14B) without sacrificing accuracy.", 381 "evidence": "Table 2 shows GSM8K/GSM+ accuracy for Qwen3-1.7B/8B/14B: PrefillShare is within 1.2% of Full-FT across all sizes.", 382 "supported": "strong" 383 }, 384 { 385 "claim": "Naive KV cache sharing causes accuracy collapse at high sharing ratios.", 386 "evidence": "Figure 2 shows naive sharing on GSM8K drops from ~72% at 0% sharing to ~58% at 100% sharing, while PrefillShare maintains ~72% at 100% sharing.", 387 "supported": "strong" 388 }, 389 { 390 "claim": "PrefillShare achieves up to 4.5× lower p95 latency in multi-model agent workloads.", 391 "evidence": "Figure 3 (Reflexion, highest arrival rate) shows PrefillShare p95 latency substantially lower than the baseline. The 4.5× figure is at peak load where baseline degrades most.", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "PrefillShare achieves up to 3.9× higher throughput in multi-model agent workloads.", 396 "evidence": "Figure 3 (Reflexion workload) shows throughput gap widening at higher arrival rates, with PrefillShare sustaining throughput where baseline saturates.", 397 "supported": "moderate" 398 }, 399 { 400 "claim": "PrefillShare maintains ~89% prefix cache hit ratio at high concurrency while baseline degrades to near zero.", 401 "evidence": "Figure 4 shows baseline prefix cache hit ratio peaks at ~60% around 40 sessions then drops sharply, while PrefillShare stays near 89% across concurrency levels.", 402 "supported": "moderate" 403 } 404 ], 405 "red_flags": [ 406 { 407 "flag": "No error bars or variance reporting", 408 "detail": "All accuracy and serving results are reported as single point estimates with no standard deviation, confidence intervals, or indication of run-to-run variance. It is unclear whether results are from single runs." 
409 }, 410 { 411 "flag": "Company evaluating its own system", 412 "detail": "All authors are from NAVER Cloud, which has a commercial interest in efficient LLM serving. No conflict of interest statement is provided. The system directly benefits NAVER Cloud's serving infrastructure." 413 }, 414 { 415 "flag": "'Up to' claims may not be representative", 416 "detail": "The headline claims of 4.5× lower latency and 3.9× higher throughput are measured at the highest load points where the baseline degrades most severely. At low load, performance is comparable." 417 }, 418 { 419 "flag": "No limitations section", 420 "detail": "The paper has no dedicated limitations or threats-to-validity section, despite several potential concerns: restricted model size range (≤14B), limited workload patterns (2), and the vLLM-specific implementation that may not generalize to other serving systems." 421 }, 422 { 423 "flag": "Serving evaluation uses fixed token lengths", 424 "detail": "Appendix B.1 states 'we fix the input and output token lengths for each model invocation' based on prior work statistics. This synthetic setup may not reflect the variable-length distributions of real production workloads." 425 } 426 ], 427 "cited_papers": [ 428 { 429 "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention", 430 "authors": ["Woosuk Kwon", "Zhuohan Li", "Siyuan Zhuang"], 431 "year": 2023, 432 "relevance": "Foundational LLM serving system (vLLM) that PrefillShare extends; introduced PagedAttention for KV cache management." 433 }, 434 { 435 "title": "DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving", 436 "authors": ["Yinmin Zhong", "Shengyu Liu", "Junda Chen"], 437 "year": 2024, 438 "relevance": "Introduced disaggregated serving architecture that PrefillShare builds upon; separates prefill and decode onto dedicated GPUs." 
439 }, 440 { 441 "title": "Splitwise: Efficient Generative LLM Inference Using Phase Splitting", 442 "authors": ["Pratyush Patel", "Esha Choukse", "Chaojie Zhang"], 443 "year": 2024, 444 "relevance": "Formalizes phase splitting for LLM serving; demonstrates improved goodput by isolating prefill and decode." 445 }, 446 { 447 "title": "ICaRus: Identical Cache Reuse for Efficient Multi-Model Inference", 448 "authors": ["Sunghyeon Woo", "Jaemin Kil", "Hoseung Kim"], 449 "year": 2026, 451 "relevance": "Direct predecessor to PrefillShare; decomposes transformer into frozen logical encoder and trainable decoder for KV reuse across fine-tuned models." 452 }, 453 { 454 "title": "DroidSpeak: KV Cache Sharing for Cross-LLM Communication and Multi-LLM Serving", 455 "authors": ["Yuhan Liu", "Yuyang Huang", "Jiayi Yao"], 456 "year": 2024, 457 "arxiv_id": "2411.02820", 458 "relevance": "Explores partial KV sharing across models by selectively reusing KV states from non-sensitive components." 459 }, 460 { 461 "title": "KVComm: Online Cross-Context KV-Cache Communication for Efficient LLM-Based Multi-Agent Systems", 462 "authors": ["Hanchen Ye", "Zijian Gao", "Mingyu Ma"], 463 "year": 2025, 464 "relevance": "Explores cross-model KV cache communication in multi-agent settings using anchor-pool-based alignment." 465 }, 466 { 467 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 468 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"], 469 "year": 2022, 470 "relevance": "Foundational agentic prompting pattern used as a workload in PrefillShare's serving evaluation." 471 }, 472 { 473 "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", 474 "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath"], 475 "year": 2023, 476 "relevance": "Agentic prompting pattern with self-reflection used as a workload in PrefillShare's serving evaluation." 
477 }, 478 { 479 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations", 480 "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"], 481 "year": 2024, 482 "relevance": "Multi-agent framework that exemplifies the multi-model workflow patterns PrefillShare targets for serving optimization." 483 }, 484 { 485 "title": "ToolOrchestra: Elevating Intelligence via Efficient Model and Tool Orchestration", 486 "authors": ["Hongjin Su", "Shuyuan Diao", "Xiaoyu Lu"], 487 "year": 2025, 488 "relevance": "Orchestrates multiple models and tools in agentic settings, reinforcing the need for efficient multi-model serving." 489 }, 490 { 491 "title": "FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness", 492 "authors": ["Tri Dao", "Dan Fu", "Stefano Ermon"], 493 "year": 2022, 494 "relevance": "Foundational attention optimization that reduces memory bandwidth overhead in transformer inference." 495 }, 496 { 497 "title": "The Cost of Dynamic Reasoning: Demystifying AI Agents and Test-Time Scaling from an AI Infrastructure Perspective", 498 "authors": ["Joonghoon Kim", "Byeongho Shin", "Jaeyun Chung"], 499 "year": 2025, 500 "relevance": "Analyzes infrastructure costs of agentic AI workloads; provides the workload statistics used in PrefillShare's serving experiments." 501 } 502 ] 503 }