scan.json (26244B)
1 { 2 "paper": { 3 "title": "Beyond Quantity: Trajectory Diversity Scaling for Code Agents", 4 "authors": [ 5 "Guhong Chen", 6 "Chenghao Sun", 7 "Cheng Fu", 8 "Qiyao Wang", 9 "Zhihong Huang", 10 "Chaopeng Wei", 11 "Guangxu Chen", 12 "Feiteng Fang", 13 "Ahmadreza Argha", 14 "Bing Zhao", 15 "Xander Xu", 16 "Qi Han", 17 "Hamid Alinejad-Rokny", 18 "Qiang Qu", 19 "Binhua Li", 20 "Shiwen Ni", 21 "Min Yang", 22 "Hu Wei", 23 "Yongbin Li", 24 "Yu Ding" 25 ], 26 "year": 2026, 27 "venue": "arXiv", 28 "arxiv_id": "2602.03219" 29 }, 30 "checklist": { 31 "artifacts": { 32 "code_released": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper states 'We plan to open-source all components that can be publicly released, subject to licensing, privacy, and internal policy constraints' (footnote 1). This is a promise of future release with caveats, not a currently available artifact. No repository URL is provided." 36 }, 37 "data_released": { 38 "applies": true, 39 "answer": false, 40 "justification": "The synthesized trajectory dataset is not released. The paper mentions a plan to release but provides no download link or repository. The benchmarks used (BFCL, tau2-Bench, etc.) are public, but the paper's own synthesized data is not available." 41 }, 42 "environment_specified": { 43 "applies": true, 44 "answer": true, 45 "justification": "Appendix C.2 specifies training setup: Megatron-LM, 8x80GB GPUs per node, BF16 precision, Flash Attention v2, Gradient Checkpointing, global batch size 16, learning rate 1e-5 with cosine decay, 50 warmup steps, sequence length 65,536 tokens. This is sufficient to recreate the training environment." 46 }, 47 "reproduction_instructions": { 48 "applies": true, 49 "answer": false, 50 "justification": "While hyperparameters and system prompts are provided in appendices, there are no step-by-step reproduction instructions, no README with commands, and no scripts to replicate experiments. The paper describes the framework conceptually but does not provide actionable instructions a researcher could follow." 51 } 52 }, 53 "statistical_methodology": { 54 "confidence_intervals_or_error_bars": { 55 "applies": true, 56 "answer": false, 57 "justification": "Tables 1, 2, and 3 report only point estimates (e.g., '36.66%', '40.44%'). No confidence intervals, error bars, or ± notation are provided for any result." 58 }, 59 "significance_tests": { 60 "applies": true, 61 "answer": false, 62 "justification": "The paper makes numerous comparative claims (e.g., TDScaling 'outperformed' baselines, 'surpassing' prior SOTA) but no statistical significance tests are reported. Comparisons are based solely on comparing point estimates." 63 }, 64 "effect_sizes_reported": { 65 "applies": true, 66 "answer": true, 67 "justification": "Tables 1 and 2 report absolute improvements with baseline context, e.g., '+7.25' improvement from 29.41% to 36.66% on BFCL, '+4.00' overall from 30.99% to 34.99% on coding benchmarks. This provides enough context to assess magnitude." 68 }, 69 "sample_size_justified": { 70 "applies": true, 71 "answer": false, 72 "justification": "The choice of 500 and 5,000 training samples is described but not justified with any formal analysis. No power analysis or rationale for why these specific sizes were chosen beyond 'maximum common data availability across open-source projects' for 5,000." 73 }, 74 "variance_reported": { 75 "applies": true, 76 "answer": false, 77 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. All results appear to be single-run numbers. There is no mention of averaging over multiple seeds or runs." 78 } 79 }, 80 "evaluation_design": { 81 "baselines_included": { 82 "applies": true, 83 "answer": true, 84 "justification": "Tables 1 and 2 compare against multiple baselines: proprietary models (GPT-5, GPT-4.1, Claude-Sonnet-4, Gemini-2.5-pro), open-source models (DeepSeek-V3.2, Qwen3-Coder-480B), and tool-learning methods (APIGen-MT, TOUCAN, Simia)." 85 }, 86 "baselines_contemporary": { 87 "applies": true, 88 "answer": true, 89 "justification": "Baselines include contemporary models and methods: GPT-5, Claude-Sonnet-4, DeepSeek-V3.2, and recent tool-learning methods APIGen-MT (2025), TOUCAN (2025), Simia (2025). These are recent and competitive." 90 }, 91 "ablation_study": { 92 "applies": true, 93 "answer": true, 94 "justification": "Table 3 presents a systematic ablation study removing individual components: w/o Cluster Sampling, w/o Global Evolution, w/o Code Tool, and w/o All. Each variant is evaluated across all benchmarks." 95 }, 96 "multiple_metrics": { 97 "applies": true, 98 "answer": true, 99 "justification": "The paper evaluates on multiple benchmarks across two dimensions: general tool-use (BFCL Multi-turn, tau2-Bench with three domains) and coding tasks (RebenchT with OH-p@1 and Qod-p@1, CodeCI avg@2, BIRD p@1)." 100 }, 101 "human_evaluation": { 102 "applies": true, 103 "answer": false, 104 "justification": "No human evaluation is included. All evaluation is automated via benchmark test suites. Given claims about 'generalization' and 'real-world' applicability, human evaluation of trajectory quality or agent behavior would be relevant." 105 }, 106 "held_out_test_set": { 107 "applies": true, 108 "answer": true, 109 "justification": "The paper uses established external benchmarks (BFCL, tau2-Bench, SWE-rebench, CodeCI/LiveCodeBench, BIRD) as test sets. These are separate from the training data, which consists of synthesized trajectories from MCP tool definitions." 110 }, 111 "per_category_breakdown": { 112 "applies": true, 113 "answer": true, 114 "justification": "Table 1 breaks down tau2-Bench results by domain (AIR, RET, TEL). Table 2 breaks down coding results by benchmark and metric (RebenchT OH-p@1 and Qod-p@1, CodeCI, BIRD). Table 3 provides per-benchmark ablation results." 115 }, 116 "failure_cases_discussed": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper discusses failure cases of baselines (negative transfer in APIGen-MT, Simia, TOUCAN on coding benchmarks). Figure 10 includes an annotated case study comparing baseline failures (misinterpreted errors, wrong parameters) with the model's correct behavior. Figure 4 discusses inverse scaling failures." 120 }, 121 "negative_results_reported": { 122 "applies": true, 123 "answer": true, 124 "justification": "Table 3 ablation shows the w/o Code Tool variant scores higher on BFCL (37.56% vs. 36.66%) at the cost of coding performance, explicitly discussed as a trade-off. The paper also reports that baseline methods suffer negative transfer." 125 } 126 }, 127 "claims_and_evidence": { 128 "abstract_claims_supported": { 129 "applies": true, 130 "answer": true, 131 "justification": "The abstract claims TDScaling improves both tool-use generalization and coding proficiency, and that diversity scaling attains a higher performance ceiling than quantity scaling. Tables 1-2 support the first claim, and Figure 4 supports the second. The abstract is appropriately hedged." 132 }, 133 "causal_claims_justified": { 134 "applies": true, 135 "answer": true, 136 "justification": "The paper makes causal claims through ablation studies (Table 3), where removing individual components (Cluster Sampling, Global Evolution, Code Tool) in a controlled manner shows their contribution. This is adequate causal design for component-level claims. The ablation is systematically structured." 137 }, 138 "generalization_bounded": { 139 "applies": true, 140 "answer": false, 141 "justification": "The title 'Trajectory Diversity Scaling for Code Agents' and claims like 'establishing a resource-efficient paradigm for training robust code agents' are broad. Results are tested only on Qwen3-Coder-30B-A3B and Qwen3-30B-A3B with Qwen3-Max as teacher. Generalization to other model families, sizes, or teacher models is not tested but is implied by the broad framing." 142 }, 143 "alternative_explanations_discussed": { 144 "applies": true, 145 "answer": false, 146 "justification": "The paper does not discuss alternative explanations for why TDScaling outperforms baselines. For instance, the teacher model (Qwen3-Max) could be a confound since baselines may use different or weaker teachers. The quality filtering pipeline differences are not controlled for. No threats-to-validity section addresses these." 147 } 148 }, 149 "setup_transparency": { 150 "model_versions_specified": { 151 "applies": true, 152 "answer": false, 153 "justification": "The paper uses 'Qwen3-Coder-30B-A3B-Instruct', 'Qwen3-30B-A3B-Instruct', and 'Qwen3-Max' without version numbers or snapshot dates. Baseline models are listed as 'GPT-5', 'GPT-4.1', 'Claude-Sonnet-4', 'Gemini-2.5-pro' without API versions or snapshot dates." 154 }, 155 "prompts_provided": { 156 "applies": true, 157 "answer": true, 158 "justification": "Appendix D.1 (Figures 6-8) provides detailed system prompts for all agents: BlueprintAgent, UserAgent, AssistantAgent, ObservationAgent, and QualityAgent. These are actual prompt text, not just descriptions." 159 }, 160 "hyperparameters_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Appendix C.2 reports training hyperparameters: global batch size 16, learning rate 1e-5 with cosine decay, 50 warmup steps, sequence length 65,536 tokens, BF16 precision, Flash Attention v2. However, inference-time parameters (temperature, top-p) for the teacher model during synthesis are not stated." 164 }, 165 "scaffolding_described": { 166 "applies": true, 167 "answer": true, 168 "justification": "The multi-agent synthesis pipeline is described in detail in Section 3: BlueprintAgent, UserAgent, AssistantAgent, ObservationAgent, QualityAgent roles, Dynamic Schema Locking mechanism, Global Memory feedback loop, and adaptive evolution. Figure 2 provides an architectural overview." 169 }, 170 "data_preprocessing_documented": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 3.1 and Appendix C.1 document the data pipeline: 30,000 raw MCP tool definitions → encoding with Qwen-Embedding-0.6B → two-level K-Means clustering (Ndom=10, Ncls=5) → greedy selection of 6,944 Business Clusters → trajectory synthesis → quality filtering via QualityAgent." 174 } 175 }, 176 "limitations_and_scope": { 177 "limitations_section_present": { 178 "applies": true, 179 "answer": true, 180 "justification": "A dedicated 'Limitations' section appears after the Conclusion, discussing compute costs of the multi-agent pipeline and the restriction to text-based tool calls and Python execution (not multi-modal)." 181 }, 182 "threats_to_validity_specific": { 183 "applies": true, 184 "answer": true, 185 "justification": "The Limitations section identifies specific threats: (1) per-trajectory API cost and latency are higher than lightweight baselines due to the multi-agent pipeline, and (2) the interaction scope is limited to text-based tool calls and Python execution, excluding graphical/multi-modal agents. These are specific to this study." 186 }, 187 "scope_boundaries_stated": { 188 "applies": true, 189 "answer": false, 190 "justification": "While the Limitations section mentions text-only scope and compute costs, it does not explicitly state what the results do NOT show. For example, it does not explicitly bound claims to the tested model family (Qwen), the specific teacher model, or the specific benchmarks used. The broad claims in the conclusion ('establishes that synthetic data pipelines must be measured and directed') are not bounded." 191 } 192 }, 193 "data_integrity": { 194 "raw_data_available": { 195 "applies": true, 196 "answer": false, 197 "justification": "Neither the raw MCP tool definitions, the synthesized trajectories, nor the intermediate data are available for download. The paper promises future release but provides no links." 198 }, 199 "data_collection_described": { 200 "applies": true, 201 "answer": true, 202 "justification": "Section 3.1 and Appendix C describe the data collection: 30,000 raw MCP-compliant tool definitions were collected, organized into Business Clusters via embedding and clustering, then filtered via greedy selection to 6,944 clusters. Trajectories were synthesized using Qwen3-Max as teacher." 203 }, 204 "recruitment_methods_described": { 205 "applies": false, 206 "answer": false, 207 "justification": "No human participants were involved. All data is synthetically generated from MCP tool definitions and LLM-based trajectory synthesis." 208 }, 209 "data_pipeline_documented": { 210 "applies": true, 211 "answer": true, 212 "justification": "The full pipeline from raw tool definitions to final training data is documented: tool collection (30,000) → embedding → K-Means clustering → greedy selection (6,944 clusters) → Blueprint-driven synthesis → quality filtering → final trajectories (500 or 5,000 samples). Appendix C provides additional detail." 213 } 214 }, 215 "conflicts_of_interest": { 216 "funding_disclosed": { 217 "applies": true, 218 "answer": false, 219 "justification": "No funding source or acknowledgments section listing grants or sponsors is present in the paper." 220 }, 221 "affiliations_disclosed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Author affiliations are clearly listed: SIAT (CAS), Qoder, Alibaba Group, UNSW Sydney, and SUAT. The connection between authors and Qoder/Alibaba is visible." 225 }, 226 "funder_independent_of_outcome": { 227 "applies": true, 228 "answer": false, 229 "justification": "No funding is disclosed, making it impossible to assess independence. Authors from Qoder and Alibaba Group could have financial interest in demonstrating the value of their training framework, but this is not discussed." 230 }, 231 "financial_interests_declared": { 232 "applies": true, 233 "answer": false, 234 "justification": "No competing interests or financial interests statement is present in the paper. Authors are affiliated with Qoder (a company) and Alibaba Group, which could have financial interests in the outcomes." 235 } 236 }, 237 "contamination": { 238 "training_cutoff_stated": { 239 "applies": true, 240 "answer": false, 241 "justification": "The paper evaluates fine-tuned models on benchmarks but does not state the training data cutoff for the base Qwen3-Coder model or the Qwen3-Max teacher model used for synthesis." 242 }, 243 "train_test_overlap_discussed": { 244 "applies": true, 245 "answer": false, 246 "justification": "No discussion of whether the base model (Qwen3-Coder) may have seen the benchmark data (BFCL, BIRD, etc.) during pre-training. The paper uses SWE-rebench which is designed for decontamination, but this is not discussed." 247 }, 248 "benchmark_contamination_addressed": { 249 "applies": true, 250 "answer": false, 251 "justification": "Several benchmarks used (BFCL, BIRD) were published before the likely training cutoff of Qwen3-Coder. LiveCodeBench/CodeCI is designed for contamination-free evaluation (noted in the citation), but the paper does not discuss contamination risk for any benchmark." 252 } 253 }, 254 "human_studies": { 255 "pre_registered": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants are involved in this study. All experiments use synthetic data and automated benchmarks." 259 }, 260 "irb_or_ethics_approval": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants are involved. The Ethical Considerations section explicitly states 'it did not involve human subjects, crowdsourcing, or personally identifiable information.'" 264 }, 265 "demographics_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants are involved in this study." 269 }, 270 "inclusion_exclusion_criteria": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants are involved in this study." 274 }, 275 "randomization_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants or experimental conditions involving human assignment." 279 }, 280 "blinding_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants or evaluators requiring blinding." 284 }, 285 "attrition_reported": { 286 "applies": false, 287 "answer": false, 288 "justification": "No human participants are involved in this study." 289 } 290 }, 291 "cost_and_practicality": { 292 "inference_cost_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No inference cost, latency, or tokens consumed are reported for either the synthesis pipeline or the evaluation runs. The Limitations section acknowledges 'per-trajectory API cost and latency are higher than lightweight baselines' but provides no numbers." 296 }, 297 "compute_budget_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "The paper mentions training on '8 × 80GB GPUs' per node but does not state how many nodes, total GPU hours, total API spend for synthesis with Qwen3-Max, or wall-clock time for training or evaluation." 301 } 302 } 303 }, 304 "claims": [ 305 { 306 "claim": "TDScaling with 500 samples enables Qwen3-Coder-30B-A3B to reach 36.66% on BFCL Multi-turn, exceeding the much larger Qwen3-Coder-480B-A35B-Instruct (35.91%).", 307 "evidence": "Table 1 shows TDScaling (500 Samples) at 36.66% vs. Qwen3-Coder-480B at 35.91% on BFCL Multi-turn.", 308 "supported": "moderate" 309 }, 310 { 311 "claim": "TDScaling is the only method that achieves comprehensive improvements across all coding benchmarks without negative transfer.", 312 "evidence": "Table 2 shows TDScaling with positive gains across all metrics (+1.92 RebenchT OH, +7.72 Qod, +4.00 CodeCI, +2.35 BIRD), while all baselines show negative transfer on at least some metrics.", 313 "supported": "moderate" 314 }, 315 { 316 "claim": "Diversity scaling achieves a substantially higher performance ceiling than quantity scaling.", 317 "evidence": "Figure 4 shows TDScaling performance increasing from ~36.66% at 500 samples to 40.44% at 5,000 samples, while baselines exhibit inverse scaling or stagnation.", 318 "supported": "moderate" 319 }, 320 { 321 "claim": "The Code Tool acts as a regularizer against catastrophic forgetting of intrinsic coding capabilities.", 322 "evidence": "Table 3 ablation: w/o Code Tool scores 37.56% on BFCL but drops to 41.58% on BIRD (vs. 43.83% full model), supporting the regularization claim.", 323 "supported": "moderate" 324 }, 325 { 326 "claim": "TDScaling's synthesized data achieves higher Domain Entropy (4.25 vs. 2.15) and Reasoning Mode Entropy (8.97 vs. 5.42) compared to baseline synthesis.", 327 "evidence": "Figure 3 reports these entropy scores comparing TDScaling vs. baseline datasets.", 328 "supported": "moderate" 329 } 330 ], 331 "methodology_tags": [ 332 "benchmark-eval" 333 ], 334 "key_findings": "TDScaling proposes shifting synthetic data scaling for code agents from quantity to diversity, using Business Cluster sampling, adaptive evolution guided by entropy/complexity metrics, and a sandboxed Code Tool as regularizer. With only 500 training samples, TDScaling enables a 30B-parameter model to match or exceed a 480B-parameter model on tool-use benchmarks (BFCL). Unlike competing tool-learning methods that suffer negative transfer on coding tasks, TDScaling improves both tool-use generalization and inherent coding proficiency across five benchmarks.", 335 "red_flags": [ 336 { 337 "flag": "No statistical uncertainty quantification", 338 "detail": "All results are reported as single point estimates without confidence intervals, error bars, significance tests, or variance across runs. Given that fine-tuning and benchmark evaluation can have high variance, the reported differences (e.g., 36.66% vs. 35.91% on BFCL) could be within noise." 339 }, 340 { 341 "flag": "Potential conflict of interest", 342 "detail": "Authors from Qoder and Alibaba Group evaluate a training framework that could be commercially valuable. The base model (Qwen3-Coder) is from the Qwen family associated with Alibaba. No funding disclosure or competing interests statement is provided." 343 }, 344 { 345 "flag": "Unfair baseline comparison", 346 "detail": "Baselines (APIGen-MT, TOUCAN, Simia) are evaluated at 5,000 samples described as 'maximum common data availability,' but TDScaling uses a proprietary teacher (Qwen3-Max) and custom synthesis pipeline. The quality of the teacher model and synthesis process is a major confound not controlled for across methods." 347 }, 348 { 349 "flag": "No contamination analysis", 350 "detail": "Several benchmarks (BFCL, BIRD) predate the likely training cutoff of Qwen3-Coder. The paper does not discuss whether the base model may have already seen benchmark data during pre-training, which could inflate reported improvements." 351 }, 352 { 353 "flag": "Artifacts not released", 354 "detail": "Despite claims of establishing 'a resource-efficient paradigm,' neither the code, synthesized data, nor trained models are released. The promise of future release is hedged with 'subject to licensing, privacy, and internal policy constraints.'" 355 } 356 ], 357 "cited_papers": [ 358 { 359 "title": "Gorilla: Large language model connected with massive APIs", 360 "authors": ["Shishir G Patil", "Tianjun Zhang", "Xin Wang", "Joseph E Gonzalez"], 361 "year": 2024, 362 "relevance": "Defines the BFCL benchmark used for evaluation and represents prior work on connecting LLMs with APIs." 363 }, 364 { 365 "title": "ToolLLM: Facilitating large language models to master 16000+ real-world APIs", 366 "authors": ["Yujia Qin", "Shihao Liang", "Yining Ye"], 367 "year": 2023, 368 "arxiv_id": "2307.16789", 369 "relevance": "Key prior work on scaling tool-use training data; represents the quantity-scaling paradigm that TDScaling critiques." 370 }, 371 { 372 "title": "APIGen-MT: Agentic pipeline for multi-turn data generation via simulated agent-human interplay", 373 "authors": ["Akshara Prabhakar", "Zuxin Liu", "Ming Zhu"], 374 "year": 2025, 375 "arxiv_id": "2504.03601", 376 "relevance": "A directly compared baseline for tool-use trajectory synthesis with Blueprint-driven mechanisms." 377 }, 378 { 379 "title": "TOUCAN: Synthesizing 1.5M tool-agentic data from real-world MCP environments", 380 "authors": ["Zhangchen Xu", "Adriana Meza Soria", "Shawn Tan"], 381 "year": 2025, 382 "arxiv_id": "2510.01179", 383 "relevance": "A directly compared baseline that generates tool-use data from real MCP environments at scale." 384 }, 385 { 386 "title": "Simia: Simulating environments with reasoning models for agent training", 387 "authors": ["Yuetai Li", "Huseyin A Inan", "Xiang Yue"], 388 "year": 2025, 389 "arxiv_id": "2511.01824", 390 "relevance": "A directly compared baseline that uses reasoning models to simulate environments for agent training." 391 }, 392 { 393 "title": "SWE-rebench: An automated pipeline for task collection and decontaminated evaluation of software engineering agents", 394 "authors": ["Ibragim Badertdinov", "Alexander Golubev"], 395 "year": 2025, 396 "arxiv_id": "2505.20411", 397 "relevance": "Benchmark used for evaluation; addresses decontamination in software engineering agent evaluation." 398 }, 399 { 400 "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code", 401 "authors": ["Naman Jain", "King Han", "Alex Gu"], 402 "year": 2024, 403 "arxiv_id": "2403.07974", 404 "relevance": "Benchmark used for evaluation; designed for contamination-free code evaluation." 405 }, 406 { 407 "title": "Can LLM already serve as a database interface? A big bench for large-scale database grounded text-to-SQLs", 408 "authors": ["Jinyang Li", "Binyuan Hui", "Ge Qu"], 409 "year": 2023, 410 "relevance": "The BIRD benchmark used for text-to-SQL evaluation in this paper." 411 }, 412 { 413 "title": "ReAct: Synergizing reasoning and acting in language models", 414 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"], 415 "year": 2022, 416 "relevance": "Foundational work on combining reasoning and action in language models, which this paper builds upon for agentic code agents." 417 }, 418 { 419 "title": "Toolformer: Language models can teach themselves to use tools", 420 "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì"], 421 "year": 2023, 422 "relevance": "Seminal work on tool-augmented language models, foundational to the tool-learning paradigm this paper extends." 423 }, 424 { 425 "title": "DeepSeek-V3.2: Pushing the frontier of open large language models", 426 "authors": ["Aixin Liu"], 427 "year": 2025, 428 "arxiv_id": "2512.02556", 429 "relevance": "Contemporary open-source model that strengthened tool-use capability; compared as a baseline achieving top performance." 430 }, 431 { 432 "title": "A survey on code generation with LLM-based agents", 433 "authors": ["Yihong Dong", "Xue Jiang", "Jiaru Qian"], 434 "year": 2025, 435 "arxiv_id": "2508.00083", 436 "relevance": "Recent survey of code generation agents that contextualizes tool integration in code agents." 437 } 438 ] 439 }