scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31379B)
      1 {
      2   "paper": {
      3     "title": "Large Language Models as Generalist Policies for Network Optimization",
      4     "authors": [
      5       "Duo Wu",
      6       "Linjia Kang",
      7       "Zhimin Wang",
      8       "Fangxin Wang",
      9       "Wei Zhang",
     10       "Xuefeng Tao",
     11       "Wei Yang",
     12       "Le Zhang",
     13       "Peng Cui",
     14       "Zhi Wang"
     15     ],
     16     "year": 2025,
     17     "venue": "arXiv.org",
     18     "arxiv_id": "2512.11839",
     19     "doi": "10.48550/arXiv.2512.11839"
     20   },
     21   "scan_version": 2,
     22   "active_modules": ["experimental_rigor", "data_leakage"],
     23   "methodology_tags": ["benchmark-eval", "case-study"],
     24   "key_findings": "Trailblazer, a framework grounding LLMs as generalist network policies via network alignment (NIOKA) and adaptive policy collaboration (APC), outperforms specialist policies in simulation on adaptive bitrate streaming (ABR; 14.5-36.6% higher QoE) and cluster job scheduling (CJS; 6.8-41.3% lower job completion time, JCT), and reduces video stall rates by 0.76-1.28% overall (up to 24.45% on HarmonyOS) versus Douyin's production policy VICC in a 3-week A/B test with 150K+ users. The paper identifies early saturation (performance plateaus beyond ~1B parameters) and selective invocation (only routing difficult cases to the LLM) as key design principles for deploying LLM-based network policies in latency-constrained systems.",
     25   "checklist": {
     26     "artifacts": {
     27       "code_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Section 4.6 states 'we have released our codes showing our implementation, including dataset processing, Trailblazer implementation, training and inference recipes, and simulator integration' with a GitHub URL (https://github.com/duowuyms/Trailblazer)."
     31       },
     32       "data_released": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Section 4.5 states 'The datasets can be accessed through our open-source github repo' and provides links to the public FCC and TPC-H datasets. However, the online Douyin experiment data is proprietary and not released."
     36       },
     37       "environment_specified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed library version specifications are provided in the paper. The paper mentions model names but not the software environment needed to reproduce experiments."
     41       },
     42       "reproduction_instructions": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Section 4.6 states the released code includes 'training and inference recipes, and simulator integration,' and Section 4.5 provides dataset access links. The GitHub repo claims to include implementation and recipes for reproduction."
     46       }
     47     },
     48     "statistical_methodology": {
     49       "confidence_intervals_or_error_bars": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Fig. 3a caption states 'Results are averaged over three random seeds, with the mean and standard deviation reported.' Fig. 3b shows box plots with distributions. However, the online A/B test results report only point estimates of relative reduction without uncertainty."
     53       },
     54       "significance_tests": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No statistical significance tests (p-values, t-tests, etc.) are reported anywhere in the paper. All comparisons between Trailblazer and baselines are based on comparing raw numbers without any formal testing."
     58       },
     59       "effect_sizes_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "The paper reports percentage improvements with baseline context throughout: '14.5%-36.6% higher QoE on ABR,' 'reducing JCT by 6.8%-41.3% on CJS,' and specific relative reductions in stall rates (0.92%, 1.28%, 0.76%) for the online experiments."
     63       },
     64       "sample_size_justified": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No justification is provided for the choice of 3 random seeds for simulation, the number of bandwidth traces (485), the 150,000+ user sample for A/B testing, or the 3-week test duration. No power analysis is discussed."
     68       },
     69       "variance_reported": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Fig. 3a reports mean and standard deviation across three random seeds. Fig. 3b shows box plots with full distributions. However, online A/B test results report only relative reductions without variance."
     73       }
     74     },
     75     "evaluation_design": {
     76       "baselines_included": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Multiple baselines per task: GENET (learning-based), BBA, MPC (rule-based) for ABR; Decima (learning-based), FIFO, Fair (rule-based) for CJS; VICC (production specialist policy) for CC."
     80       },
     81       "baselines_contemporary": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "GENET (2022) and Decima (2019) are state-of-the-art learning-based policies for their respective tasks. VICC is the current production system at Douyin. BBA (2014), MPC (2015), FIFO, and Fair are standard rule-based baselines appropriately included."
     85       },
     86       "ablation_study": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Fig. 4a ablates pretrained knowledge (reinitialize from scratch) and domain knowledge (freeze backbone, only train encoder/decoder). Fig. 5 ablates the scheduler component. Fig. 4c studies model scale effects."
     90       },
     91       "multiple_metrics": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "QoE for ABR, JCT for CJS, MAPE and request processing delay for CC validation, video stall rate (at 100ms, 200ms, 500ms thresholds) for online experiments, plus per-OS and per-distance breakdowns."
     95       },
     96       "human_evaluation": {
     97         "applies": false,
     98         "answer": false,
     99         "justification": "Human evaluation is not relevant to evaluating network control policies. The system outputs are bandwidth estimates and sending rates — numerical control decisions, not content for human judgment. QoE and stall rates are automated system metrics."
    100       },
    101       "held_out_test_set": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Clear train/val/test splits: ABR uses 235 train, 150 validation, 100 test FCC traces plus separate OOD test environments. CC uses 95% train, 5% test split. CJS uses separate training and test environments."
    105       },
    106       "per_category_breakdown": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Results broken down by task (ABR, CJS, CC), by environment (in-distribution and three OOD settings), by OS platform (Android, iOS, other), by geographical distance from server, and by stall duration threshold."
    110       },
    111       "failure_cases_discussed": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "No error analysis or discussion of specific failure modes of Trailblazer. The paper only shows scenarios where Trailblazer succeeds. No discussion of when or where the approach breaks down."
    115       },
    116       "negative_results_reported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "OPT-0.35B 'performs worse than baselines' due to insufficient pretrained knowledge (Fig. 4c). The frozen-backbone variant 'fails to generalize across different tasks' (Fig. 4a). Trailblazer without scheduler shows severe degradation under high load (Fig. 5b). Android performance is only 'on par' with VICC (Fig. 6b)."
    120       }
    121     },
    122     "claims_and_evidence": {
    123       "abstract_claims_supported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Abstract claims about cross-task and cross-environment generalization are supported by Fig. 3. The claim about outperforming VICC in online A/B tests is supported by Fig. 6. The claim about 150,000+ users is stated in Section 2.3.3."
    127       },
    128       "causal_claims_justified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The main causal claims are supported by controlled ablations: removing pretrained weights degrades performance (Fig. 4a), freezing the backbone degrades generalization (Fig. 4a), removing the scheduler degrades efficiency (Fig. 5). These are standard single-variable manipulation ablation designs."
    132       },
    133       "generalization_bounded": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The title claims 'Generalist Policies for Network Optimization' broadly, but only 3 networking tasks are tested (ABR, CJS, CC), all related to video streaming/data center infrastructure. The abstract claims 'generalist network policies that can generalize across diverse tasks and environments' which significantly overstates the breadth of evidence."
    137       },
    138       "alternative_explanations_discussed": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper attributes success to 'pretrained knowledge encoding transferable networking principles' without considering alternative explanations such as general sequence modeling ability, the effect of model capacity, or whether the fine-tuning procedure alone is sufficient with any large architecture."
    142       },
    143       "proxy_outcome_distinction": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The paper measures QoE, JCT, MAPE, and stall rates, which are standard networking metrics that directly measure what is claimed. QoE measures video quality experience, JCT measures job completion efficiency, and stall rates measure playback smoothness. No significant proxy gap exists."
    147       }
    148     },
    149     "setup_transparency": {
    150       "model_versions_specified": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Specific model versions are named: 'Llama2-7B' (ref [31]), 'Qwen2.5-0.5B' (ref [21]), 'OPT' family with specific sizes (0.35B, 1.3B, etc.), 'Mistral' 7B (ref [64]), 'LLaVa' 7B (ref [65]). These are specific open-source model releases."
    154       },
    155       "prompts_provided": {
    156         "applies": false,
    157         "answer": false,
    158         "justification": "The paper does not use text prompting. The LLM is fine-tuned with custom network state encoders and action decoders that process non-textual numerical data. No prompts are involved in the system."
    159       },
    160       "hyperparameters_reported": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "Some hyperparameters are reported: context windows w=10 (ABR), w=20 (CJS), QoE coefficients λ1=4.3 and λ2=1, scheduler thresholds α1=50ms, α2=0.05, α3=0.95, batch size=64. However, core training hyperparameters (learning rate, optimizer, number of epochs, weight decay) are not reported."
    164       },
    165       "scaffolding_described": {
    166         "applies": false,
    167         "answer": false,
    168         "justification": "No agentic scaffolding is used. The system is a fine-tuned LLM with custom encoder/decoder modules for processing numerical network data, not an agent with tools or multi-step reasoning."
    169       },
    170       "data_preprocessing_documented": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 4.2 describes experience dataset construction for each task: how baseline policies were run in simulated environments to collect state-action-reward tuples, data split ratios (95/5 for CC, 235/150/100 traces for ABR), and the 30,000+ session CC dataset construction with device/media/network configurations."
    174       }
    175     },
    176     "limitations_and_scope": {
    177       "limitations_section_present": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No dedicated limitations section. The Discussion (Section 3) briefly mentions 'the internal decision logic of LLMs remains difficult to interpret' as future work, but this is a single sentence, not a substantive limitations discussion."
    181       },
    182       "threats_to_validity_specific": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No threats to validity are discussed. The paper does not mention potential threats such as the limited number of tasks tested, the reliance on specific simulators, or the non-generalizability of Douyin-specific findings."
    186       },
    187       "scope_boundaries_stated": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No explicit scope boundaries are stated. The paper presents its findings as establishing 'LLMs as the foundation of generalist network policies' without explicitly stating what the results do NOT show or what settings are excluded."
    191       }
    192     },
    193     "data_integrity": {
    194       "raw_data_available": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "Simulation data uses publicly available FCC and TPC-H datasets, and some data is accessible via GitHub. However, the online Douyin experiment data is proprietary and cannot be independently verified — only relative reductions are reported due to 'compliance with Douyin's data security policy.'"
    198       },
    199       "data_collection_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Section 4.2 describes data collection in detail: FCC bandwidth traces (485 traces, split 235/150/100), TPC-H benchmark jobs, and CC experience dataset construction using 6 mobile devices, 9 media content types, and HoloWAN network emulator across diverse conditions."
    203       },
    204       "recruitment_methods_described": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "For the online A/B test with 150,000+ users, the paper states requests were 'randomly routed to these servers' but does not describe how the specific user sample was drawn from Douyin's broader user base or whether the sample is representative."
    208       },
    209       "data_pipeline_documented": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The simulation data pipeline is documented: environment simulation → baseline policy interaction → state-action-reward collection → train/test split. Section 4.2.3 describes CC data collection: real sessions with mobile devices → network conditions via HoloWAN → policy-driven data collection → 95/5 train/test split."
    213       }
    214     },
    215     "conflicts_of_interest": {
    216       "funding_disclosed": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No funding disclosure or acknowledgments section is present in the paper."
    220       },
    221       "affiliations_disclosed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "Author affiliations are clearly listed, including Tsinghua University, CUHK Shenzhen, and ByteDance. The footnote specifies which ByteDance authors are in which cities."
    225       },
    226       "funder_independent_of_outcome": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "Multiple co-authors (Wei Zhang, Xuefeng Tao, Wei Yang, Le Zhang) are from ByteDance, whose product Douyin is the deployment platform. ByteDance has a direct interest in validating that their production system can be improved by LLM-based approaches. No funding is formally disclosed, making independence impossible to assess."
    230       },
    231       "financial_interests_declared": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No competing interests statement or financial interest disclosures are present in the paper."
    235       }
    236     },
    237     "contamination": {
    238       "training_cutoff_stated": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No training data cutoff dates are stated for any of the pre-trained models used (Llama2, Qwen2.5, OPT, Mistral, LLaVa). The paper claims pre-trained knowledge contributes to performance but does not state when training data was collected."
    242       },
    243       "train_test_overlap_discussed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "No discussion of whether pre-training data might overlap with test data. The paper claims pretrained knowledge encodes networking principles but does not analyze whether the LLMs were trained on networking literature that includes the specific benchmark problems or solutions."
    247       },
    248       "benchmark_contamination_addressed": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No discussion of benchmark contamination. While the benchmarks (FCC traces, TPC-H) are numerical/structural data unlikely to appear verbatim in LLM pre-training, the paper's explicit claim that pretrained knowledge encodes networking principles makes contamination discussion relevant."
    252       }
    253     },
    254     "human_studies": {
    255       "pre_registered": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "The online A/B test is a standard production system evaluation, not a human subjects study. Users are normal Douyin users whose traffic was routed through different backend systems within normal service parameters."
    259       },
    260       "irb_or_ethics_approval": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human subjects study conducted. The A/B test operates within Douyin's standard service agreement with users. Section 4.4 notes compliance with the user agreement and no PII collection."
    264       },
    265       "demographics_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human subjects study. The paper reports client statistics (OS distribution, geographic distribution) as system metrics, not participant demographics."
    269       },
    270       "inclusion_exclusion_criteria": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human subjects study. Users were not recruited — they are normal platform users whose requests were randomly routed."
    274       },
    275       "randomization_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human subjects study. Request routing is a standard A/B test mechanism within production infrastructure."
    279       },
    280       "blinding_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human subjects study. Users were not aware of condition assignment as this was a backend system change."
    284       },
    285       "attrition_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human subjects study. The paper reports total user counts and playback time but not as participant attrition."
    289       }
    290     },
    291     "cost_and_practicality": {
    292       "inference_cost_reported": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Section 4.1.3 reports: 'the LLM consumes approximately 4.5 GB GPU memory and takes about 30 ms per inference' with batch size 64, achieving 'average inference latency of 37.1 ms.' Fig. 5b reports processing delay under various loads."
    296       },
    297       "compute_budget_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No total computational budget is stated. Training GPU hours, total training time, and hardware specifications for training are not reported. Only inference costs are described."
    301       }
    302     },
    303     "experimental_rigor": {
    304       "seed_sensitivity_reported": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "Fig. 3a caption states 'Results are averaged over three random seeds, with the mean and standard deviation reported.'"
    308       },
    309       "number_of_runs_stated": {
    310         "applies": true,
    311         "answer": true,
    312         "justification": "Three random seeds are explicitly stated for simulation experiments (Fig. 3a). Online A/B tests ran for 3 weeks with 150,000+ users."
    313       },
    314       "hyperparameter_search_budget": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "No hyperparameter search budget is reported. Values like w=10, w=20, α1=50ms are stated as 'empirically set' without describing the search process or number of configurations tried."
    318       },
    319       "best_config_selection_justified": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "Threshold values (α1=50ms, α2=0.05, α3=0.95) and context windows (w=10, w=20) are described as 'empirically set' without justification for the selection or description of the selection criterion."
    323       },
    324       "multiple_comparison_correction": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "Multiple comparisons are made across tasks, environments, OS platforms, and distance categories without any significance tests or multiple comparison corrections."
    328       },
    329       "self_comparison_bias_addressed": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "The authors evaluate their own system Trailblazer against baselines without acknowledging self-comparison bias. Section 4.4 notes simulation data is 'reused from our prior work [53],' meaning the baseline implementations may also come from the authors' prior work."
    333       },
    334       "compute_budget_vs_performance": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "Extended Data Fig. 2 shows model scale vs. performance and runtime for CC, but there is no fair comparison of compute budgets between Trailblazer (using a 7B LLM or 0.5B LLM) and the lightweight rule-based or small RL baselines. The compute gap is enormous and not addressed."
    338       },
    339       "benchmark_construct_validity": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "The paper uses FCC traces, TPC-H jobs, and Douyin's CC environment without discussing whether these benchmarks adequately measure the claimed 'generalist network policy' capability or whether performance on 3 tasks generalizes to networking broadly."
    343       },
    344       "scaffold_confound_addressed": {
    345         "applies": false,
    346         "answer": false,
    347         "justification": "No scaffolding confound exists. The system (NIOKA + APC) IS the thing being tested, not a confounding scaffold applied to different models."
    348       }
    349     },
    350     "data_leakage": {
    351       "temporal_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of temporal leakage. The paper does not address whether the LLMs' pre-training data includes information about the benchmarks or their solutions."
    355       },
    356       "feature_leakage_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of feature leakage. The experience dataset is collected from baseline policies, and whether the training data provides information not available at inference time is not analyzed."
    360       },
    361       "non_independence_addressed": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No discussion of non-independence between training and test data. FCC bandwidth traces are randomly split but potential structural similarities between training and test traces are not addressed."
    365       },
    366       "leakage_detection_method": {
    367         "applies": true,
    368         "answer": false,
    369         "justification": "No leakage detection or prevention method is used or discussed."
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "Trailblazer achieves 14.5%-36.6% higher QoE on ABR and reduces JCT by 6.8%-41.3% on CJS compared to specialist baselines, demonstrating cross-task generalization with a single LLM.",
    376       "evidence": "Fig. 3a shows mean and standard deviation across 3 random seeds for both tasks, comparing against GENET, BBA, MPC (ABR) and Decima, FIFO, Fair (CJS).",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "Trailblazer demonstrates stronger cross-environment generalization, improving mean QoE by 3.9%-24.8% over rule-based and 1.5%-44.3% over learning-based policies in OOD environments.",
    381       "evidence": "Fig. 3b shows performance distributions across three OOD environments for each task, with box plots and mean values.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "Pretrained knowledge of LLMs encodes transferable and abstract network knowledge that serves as a critical prerequisite for generalist network policies.",
    386       "evidence": "Fig. 4a shows that reinitializing the LLM and training from scratch leads to significant performance degradation on both ABR and CJS, compared to using pretrained weights.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "LLMs exhibit early saturation in network optimization — performance saturates rapidly beyond ~1B parameters.",
    391       "evidence": "Fig. 4c shows OPT variants from 0.35B to 6.7B on ABR, with performance plateauing after 1B. Extended Data Fig. 2 shows similar saturation for Qwen2.5 on CC.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Trailblazer reduces video stall rates by 0.92% (100ms), 1.28% (200ms), and 0.76% (500ms) relative to VICC in production deployment on Douyin.",
    396       "evidence": "Fig. 6a from 3-week online A/B tests with 150,000+ users across 100+ cities. Only relative reductions reported due to data security policy.",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "Selective invocation via the scheduler reduces average processing delay from 345ms to 61ms while incurring only 2.66% higher MAPE.",
    401       "evidence": "Fig. 5b shows processing delay comparison at p=20% and peak load of 2,000 requests between Trailblazer with and without scheduler.",
    402       "supported": "strong"
    403     },
    404     {
    405       "claim": "Trailblazer significantly outperforms VICC on HarmonyOS platforms, with a 24.45% relative reduction in stall rate, demonstrating rapid adaptation to emerging OS without manual tuning.",
    406       "evidence": "Fig. 6b shows relative stall rate reductions across OS categories. However, the HarmonyOS sample size is not specified (Fig. 6d shows 'Other' as a small proportion).",
    407       "supported": "weak"
    408     }
    409   ],
    410   "red_flags": [
    411     {
    412       "flag": "Company evaluating its own product",
    413       "detail": "Four co-authors are from ByteDance, whose product Douyin is the deployment platform. They compare Trailblazer against VICC, ByteDance's own production policy. No independent evaluation is conducted, and no competing interests are declared."
    414     },
    415     {
    416       "flag": "Reused experimental data",
    417       "detail": "Section 4.4 acknowledges 'The experimental data in Section 2.2 is reused from our prior work [53].' The simulation experiments (ABR and CJS) are not new data collection — only the online CC experiments are novel."
    418     },
    419     {
    420       "flag": "Only relative improvements reported for key experiments",
    421       "detail": "Due to Douyin's data security policy, all online A/B test results are reported as relative reductions over VICC without absolute stall rate values. This makes independent verification impossible and prevents assessing practical significance."
    422     },
    423     {
    424       "flag": "Overclaiming generalization from narrow evidence",
    425       "detail": "The paper claims LLMs as 'the foundation for generalist network policies' and 'a new generalist-driven paradigm' based on 3 networking tasks (ABR, CJS, CC), all related to video streaming and data center infrastructure. The title and abstract significantly overstate the breadth of evidence."
    426     },
    427     {
    428       "flag": "No statistical significance tests",
    429       "detail": "Despite making numerous comparative claims across tasks, environments, and deployment conditions, no statistical significance tests are performed anywhere in the paper. All claims of superiority are based on raw number comparisons."
    430     },
    431     {
    432       "flag": "No limitations section",
    433       "detail": "The paper lacks a dedicated limitations section. The only limitation mentioned is a brief sentence about interpretability in the Discussion section. Significant limitations — narrow task coverage, simulator fidelity, compute cost, non-transferability of Douyin-specific findings — are not discussed."
    434     }
    435   ],
    436   "cited_papers": [
    437     {
    438       "title": "NetLLM: Adapting large language models for networking",
    439       "authors": ["Duo Wu", "Xiyang Wang", "Yanwei Qiao", "Zhi Wang", "Junchen Jiang", "Shuguang Cui", "Fangxin Wang"],
    440       "year": 2024,
    441       "relevance": "Prior work sharing three authors with this paper (Duo Wu, Zhi Wang, Fangxin Wang) on adapting LLMs for networking tasks, establishing the foundation for Trailblazer."
    442     },
    443     {
    444       "title": "The llama 3 herd of models",
    445       "authors": ["Aaron Grattafiori"],
    446       "year": 2024,
    447       "arxiv_id": "2407.21783",
    448       "relevance": "Foundation LLM family used in the experiments, relevant to understanding LLM capabilities for non-NLP tasks."
    449     },
    450     {
    451       "title": "Qwen2.5 technical report",
    452       "authors": ["Qwen Team"],
    453       "year": 2024,
    454       "arxiv_id": "2412.15115",
    455       "relevance": "Foundation LLM used for real-world Douyin deployment (Qwen2.5-0.5B), demonstrating small LLM effectiveness in production."
    456     },
    457     {
    458       "title": "Emergent abilities of large language models",
    459       "authors": ["Jason Wei"],
    460       "year": 2022,
    461       "arxiv_id": "2206.07682",
    462       "relevance": "Foundational paper on LLM emergent abilities cited to justify using LLMs for network optimization generalization."
    463     },
    464     {
    465       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    466       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    467       "year": 2024,
    468       "relevance": "Cost-efficient LLM usage strategies, relevant to the paper's theme of balancing LLM capability with computational efficiency."
    469     },
    470     {
    471       "title": "Hybrid LLM: Cost-efficient and quality-aware query routing",
    472       "authors": ["Dujian Ding"],
    473       "year": 2024,
    474       "relevance": "LLM query routing between small and large models, directly related to Trailblazer's adaptive policy collaboration mechanism."
    475     },
    476     {
    477       "title": "RouteLLM: Learning to route LLMs from preference data",
    478       "authors": ["Isaac Ong"],
    479       "year": 2025,
    480       "relevance": "Dynamic routing between LLMs based on query difficulty, conceptually similar to Trailblazer's scheduler for routing network requests."
    481     },
    482     {
    483       "title": "LLM-blender: Ensembling large language models with pairwise ranking and generative fusion",
    484       "authors": ["Dongfu Jiang"],
    485       "year": 2023,
    486       "relevance": "LLM ensembling and collaboration strategies relevant to the paper's multi-policy collaboration approach."
    487     },
    488     {
    489       "title": "Decision transformer: Reinforcement learning via sequence modeling",
    490       "authors": ["Lili Chen"],
    491       "year": 2021,
    492       "relevance": "Core algorithmic foundation for Trailblazer's offline reinforcement fine-tuning, reformulating RL as sequence modeling for LLMs."
    493     },
    494     {
    495       "title": "Neural adaptive video streaming with Pensieve",
    496       "authors": ["Hongzi Mao", "Ravi Netravali", "Mohammad Alizadeh"],
    497       "year": 2017,
    498       "relevance": "Pioneering deep learning approach for adaptive bitrate streaming, a key baseline domain for LLM-based network optimization."
    499     },
    500     {
    501       "title": "Llama 2: Open foundation and fine-tuned chat models",
    502       "authors": ["Hugo Touvron"],
    503       "year": 2023,
    504       "arxiv_id": "2307.09288",
    505       "relevance": "Primary foundation LLM used in Trailblazer's simulation experiments, relevant to understanding LLM adaptation for non-NLP tasks."
    506     }
    507   ]
    508 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs