scan.json (27176B)
1 { 2 "paper": { 3 "title": "Chasing Progress, Not Perfection: Revisiting Strategies for End-to-End LLM Plan Generation", 4 "authors": ["Sukai Huang", "Nir Lipovetzky", "Trevor Cohn"], 5 "year": 2024, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2412.10675" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "An anonymous code repository is provided: https://anonymous.4open.science/r/official-misconcept-lm-plan-gen-D34B (mentioned in Section 4.1 and Appendix). While anonymous for review, it is a working URL." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "The paper states 'The dataset will be released via Huggingface Hub' (Section 3.1 footnote and Appendix C), indicating a promise of future release. This counts as NO per evaluation criteria." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": true, 24 "justification": "Appendix B provides detailed implementation details including hardware (4 Nvidia A100 GPUs), the specific model (Qwen2-7B-Instruct), deepspeed configuration (zero 3), bf16 precision, and Appendix B.1 lists a comprehensive hyperparameter table with learning rates, batch sizes, LoRA configuration, etc." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "While hyperparameters are listed in a table and code is provided via an anonymous repository, there are no explicit step-by-step reproduction instructions (e.g., a README with commands). The paper describes the methodology but does not provide a clear recipe to reproduce from scratch." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Results are reported as point estimates (e.g., '98.5% validity rate') across all tables (Tables 1-5) with no confidence intervals, error bars, or uncertainty quantification." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": true, 41 "justification": "Table 2 caption states 'Improvements of statistical significance are highlighted in green, while significant declines are highlighted in red,' indicating statistical significance testing was performed, though the specific test used is not named." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports improvements with baseline context throughout, e.g., 'RL boosted the validity rate on the long test set from 34.8% to 41.5% (a 6.7% increase) and the executability rate from 42.3% to 53.6% (9.0%)' (Section 4.7). These provide enough context to assess magnitude." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The training set is 4000 instances per domain and the test sets are 200 instances per domain, but no justification is given for why these specific sizes were chosen. The paper notes that 4000 is 5.7% of PlanGPT's data but does not justify why this amount is sufficient statistically." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. All results appear to be single-run numbers with no indication of multiple runs or seeds for the main experiments." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper includes the vanilla fine-tuned model (no strategies) as a baseline (Table 2 row 1), and compares multiple strategies against it. The SFT baseline is also compared against RL using the same data (Figure 6, Section 4.7)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "The strategies compared (CoT from Wei et al. 2022, self-correction from Kumar et al. 2024 and Ye et al. 2024, RL from Liu et al. 2024, permutation from Allen-Zhu and Li 2023) are contemporary and represent the state of relevant reasoning-enhancement approaches. The paper explicitly positions against PlanGPT (Rossetti et al. 2024)." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Table 2 is an extensive ablation study with 11 rows showing combinations of permutation, Goal CoT, State CoT, self-correction, and RL strategies, allowing assessment of each component's contribution." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper uses validity rate and executability rate as two primary metrics, plus introduces goal satisfiability rate (Appendix G) and pass@k (Appendix F). The introduction of executability as a complement to validity is a key contribution." 79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "This paper evaluates automated planning using formal verification (VAL validator). Plan correctness is deterministically verifiable, making human evaluation irrelevant to the claims." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "The paper uses distinct test sets explicitly separated from training data: 'In-Distrib' (same distribution, different instances), 'Long' (OOD plan lengths 17-32), 'Unseen' (novel domains), and 'Obfuscated' (obfuscated vocabulary). For RL, 10% of the 'long' test set was used for training and 90% for evaluation (Section 4.7)." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Table 1 provides per-domain breakdowns across all 8 training domains plus 2 unseen and 2 obfuscated domains. Table 2 provides breakdowns by test set category (In-Distrib, Long, Unseen, Obfuscated)." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Extensive failure analysis is provided: Section 4.1 discusses failures on OOD and obfuscated sets, Section 4.3 analyzes Goal CoT failures, Section 4.4 discusses self-correction failures with qualitative analysis, and Appendix D provides a detailed failure case study with visualizations." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Multiple negative results are reported: Goal CoT hinders performance (Section 4.3), self-correction fails to improve validity rates (Section 4.4), fine-tuning fails on OOD scenarios (Section 4.1), and the obfuscated test set shows 0% across all strategies. The paper is structured around understanding what does and does not work." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims are supported: (1) fine-tuning does not lead to robust planning (Table 1 OOD results), (2) strategies like CoT enhance executability (Table 2), (3) RL with LCCS is the most effective strategy (Table 2 rows 10-11). All claims match the experimental results." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper makes causal claims about strategy effectiveness through controlled ablation studies (Table 2), where each row changes one variable at a time. The RL comparison uses the same training data with SFT vs RL to control for data (Figure 6, Section 4.7). The plan continuation experiment (Section 4.6) tests the specific hypothesis about CoT's distribution sensitivity." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper explicitly bounds its scope to 'end-to-end plan generation' paradigm (Section 2, Scope), excluding LLM-Modulo, Thought of Search, and hybrid models like AlphaMath. The Discussion section (Section 5) restates the paradigm limitation. Results are specific to Qwen2-7B on the extended PlanBench dataset." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper discusses alternative explanations for several findings: the CHILDSNACK success might reflect task structure rather than planning ability (Section 4.1), State CoT's improvement may be limited to in-distribution lengths (Section 4.5, verified in Section 4.6), and the RL improvement is verified not to come from additional training data via SFT control (Section 4.7, Figure 6)." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper specifies 'QWEN2-7B-INSTRUCT' (Section 4, also listed in Appendix B.1 hyperparameter table as 'Qwen2-7B-Instruct'), which is a specific model version and size." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Full prompt examples are provided: Figure 2 shows the PlanBench query format, Figures 3-5 show the augmented formats for each strategy, Figure 9 shows a complete training query example, and Figure 10 shows a complete response with CoT and self-correction." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Appendix B.1 provides a comprehensive hyperparameter table covering data generation, fine-tuning (learning rate 1.0e-5, cosine scheduler, epochs, batch sizes, deepspeed config), RL (LoRA rank 128, alpha 256, learning rate 3.0e-6, episodes 500000), and evaluation (top_p 0.93, top_k 50, k values 1/3/5)." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used. The approach is end-to-end plan generation via fine-tuned LLM with standard next-token prediction at inference time." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 3.1 describes how the extended PlanBench dataset was constructed from IPC benchmarks, with specific train/test sizes (4000 train per domain, 200 test per domain), plan length ranges (3-16 for training, 17-32 for long test), and the serialization format from PDDL to natural language is described with examples." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 5 (Discussion) serves as a combined discussion and limitations section, discussing the paradigm limitations of end-to-end plan generation. Additionally, the LCCS reward's limitation (reliance on a single reference plan) is explicitly acknowledged in Section 3.2." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Specific threats are discussed throughout: the LCCS reward has 'an inherent bias because it relies on a single reference plan' (Section 3.2), the model was trained on only 8 domains which limits generalization claims, and the RL training used 10% of the long test set (Section 4.7). The paper also discusses potential data contamination risk and why Qwen2 was chosen to reduce it (Section 4)." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "The Scope subsection in Section 2 explicitly states what is excluded: LLM-Modulo framework, Thought of Search, hybrid models like AlphaMath with MCTS. The paper limits itself to 'paradigms that utilize basic next-token prediction during inference.' Section 5 reiterates: 'We have limited our scope to the end-to-end plan generation paradigm.'" 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "The dataset is promised for future release ('will be released via Huggingface Hub once the review process is completed') but is not yet available. Raw experimental outputs (generated plans, per-instance results) are not provided." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 3.1 describes how the PlanBench dataset was extended: new domains were added from IPC benchmarks, longer-plan problems were generated with plan lengths 17-32, and the overall dataset structure is detailed (4000 train instances/domain, 200 test instances/domain across 4 test categories)." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. The data source is the PlanBench benchmark and IPC planning benchmarks, which are standard resources." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The data pipeline is documented: PDDL domains from IPC benchmarks are serialized to natural language via PlanBench templates (Figure 2), then augmented with various strategies (permutation Figure 3, CoT Figure 4, self-correction Figure 5). Plan validation uses the VAL tool. The train/test split sizes and plan length distributions are specified." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No acknowledgments or funding section is present in the paper. No mention of grants or funding sources." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are listed: all three authors are from the School of Computing and Information Systems, The University of Melbourne. Trevor Cohn is noted as 'Now at Google DeepMind.'" 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding is disclosed, so independence cannot be assessed. Trevor Cohn's affiliation with Google DeepMind is noted, which could be relevant given the paper evaluates LLM capabilities, but no explicit conflict discussion is provided." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper does not state the exact training data cutoff date for Qwen2-7B-Instruct. It notes the model 'was pre-trained on general text but not on code' and 'was trained on a smaller dataset' than LLaMA 3 to 'reduce the risk of having exposed to PlanBench' (Section 4), but no specific cutoff date is provided." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "Section 4 discusses this: 'QWEN2's architecture shares significant similarities with LLAMA 3, but was trained on a smaller dataset. It thus reduce the risk of having exposed to PlanBench or related data during pre-training.' The paper also uses obfuscated domains and OOD tests to mitigate this concern." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": true, 230 "justification": "The paper addresses contamination risk by: (1) choosing Qwen2 over larger models to reduce exposure risk, (2) using obfuscated domain names to test whether the model relies on memorized concepts, (3) using OOD test sets with longer plan lengths not seen during fine-tuning, and (4) limiting training data to 5.7% of PlanGPT's amount." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study. It is a benchmark evaluation of an LLM fine-tuned for planning." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inference cost, latency, or tokens consumed per example are reported despite the approach requiring fine-tuning and multiple strategy evaluations across thousands of instances." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "While '4 Nvidia A100 GPUs' are mentioned in Appendix B, no total GPU hours, training time, or total computational budget is stated. The RL training mentions '500000 total episodes' but not wall-clock time." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Fine-tuning LLMs on planning datasets does not lead to robust planning skills, as shown by poor performance on out-of-distribution test sets.", 286 "evidence": "Table 1 shows validity rate drops from 98.5% to 13.5% in Blocksworld when moving from in-distribution to longer plans. Unseen domains show 0% validity and obfuscated domains show 0% validity and executability (Section 4.1).", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Strategies like chain-of-thought enhance plan executability even when they do not improve validity rates.", 291 "evidence": "Table 2 shows State CoT (row 4) achieves 100% executability on unseen domains while validity remains 0%. Permutation augmentation raises unseen executability from 20.1% to 75.5% (row 2) with no validity improvement (Section 4.2).", 292 "supported": "strong" 293 }, 294 { 295 "claim": "RL with the proposed LCCS reward is the most effective strategy, improving validity by 6.7% and executability by 9.0% on longer problems.", 296 "evidence": "Table 2 row 10 shows validity improving from 34.8% to 41.5% and executability from 42.3% to 53.6% on the long test set. RL also achieves 12.5% validity on the unseen test set where all other strategies achieve 0% (Section 4.7).", 297 "supported": "strong" 298 }, 299 { 300 "claim": "Goal CoT hinders planning performance due to a complexity paradox and overfitting to training plan lengths.", 301 "evidence": "Table 2 rows 3, 5, 7, 9 show Goal CoT consistently reduces performance. Section 4.3 attributes this to the rigidity of fixing step counts before planning and the model's bias toward in-distribution length estimates.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "LLMs can recognize planning mistakes with high precision (90.5%) and recall (99.2%) but fail to correct them.", 306 "evidence": "Table 3 shows probing test results with precision 87.5-90.5% and recall 97.4-99.2%. Despite high detection, self-correction does not improve validity rates (Table 2 rows 6-9). Appendix D provides qualitative failure analysis (Section 4.4).", 307 "supported": "strong" 308 }, 309 { 310 "claim": "State CoT's effectiveness is limited to in-distribution plan lengths.", 311 "evidence": "Table 2 row 4 shows State CoT achieves 100% executability on unseen test set (same length distribution as training) but only 43.0% on long test set. The plan continuation experiment (Section 4.6, Figure 7) confirms CoT works within familiar length ranges.", 312 "supported": "strong" 313 } 314 ], 315 "methodology_tags": ["benchmark-eval"], 316 "key_findings": "Fine-tuning LLMs on planning datasets achieves high in-distribution performance but fails drastically on out-of-distribution scenarios, with validity dropping from 98.5% to 13.5% on longer problems and 0% on unseen/obfuscated domains. Various reasoning-enhancement strategies (permutation, CoT, self-correction) improve plan executability but not validity, revealing that progress toward better plans occurs even without achieving final correctness. Reinforcement learning with a novel Longest Contiguous Common Subsequence (LCCS) reward emerged as the most effective strategy, improving both validity (+6.7%) and executability (+9.0%) on longer problems and being the only strategy to achieve non-zero validity on unseen domains (12.5%).", 317 "red_flags": [ 318 { 319 "flag": "No variance or error bars reported", 320 "detail": "All results are reported as point estimates with no standard deviations across runs. Given that both fine-tuning and RL involve stochastic optimization, single-run results may not be stable. The statistical significance testing mentioned in Table 2 caption is not described in detail." 321 }, 322 { 323 "flag": "RL evaluated on same distribution as training subset", 324 "detail": "RL was trained on 10% of the 'long' test set and evaluated on the remaining 90%. While the SFT control uses the same data split, this means RL results on the 'long' test set are not fully independent of the RL training distribution, and the paper does not discuss how the 10% was selected." 325 }, 326 { 327 "flag": "Anonymous repository during review", 328 "detail": "The code repository is an anonymous review link (anonymous.4open.science). The dataset is promised for future release on HuggingFace Hub. Neither may remain accessible post-review." 329 } 330 ], 331 "cited_papers": [ 332 { 333 "title": "Position: LLMs Can't Plan, But Can Help Planning in LLM-Modulo Frameworks", 334 "authors": ["Subbarao Kambhampati", "Karthik Valmeekam", "Lin Guan", "Mudit Verma", "Kaya Stechly", "Siddhant Bhambri", "Lucas Paul Saldyt", "Anil Bhaskar Murthy"], 335 "year": 2024, 336 "relevance": "Key position paper arguing LLMs cannot plan autonomously, directly contrasted by this paper's findings on incremental improvements." 337 }, 338 { 339 "title": "Planning in Strawberry Fields: Evaluating and Improving the Planning and Scheduling Capabilities of LRM o1", 340 "authors": ["Karthik Valmeekam", "Kaya Stechly", "Atharva Gundawar", "Subbarao Kambhampati"], 341 "year": 2024, 342 "arxiv_id": "2410.02162", 343 "relevance": "Evaluates OpenAI o1's planning capabilities, finding even advanced reasoning models fail on long-term planning tasks." 344 }, 345 { 346 "title": "Chain-of-thought prompting elicits reasoning in large language models", 347 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Fei Xia", "Ed Chi", "Quoc V. Le", "Denny Zhou"], 348 "year": 2022, 349 "relevance": "Foundational CoT prompting paper whose strategy is evaluated and found to have limited effectiveness in planning tasks." 350 }, 351 { 352 "title": "Training Language Models to Self-Correct via Reinforcement Learning", 353 "authors": ["Aviral Kumar", "Vincent Zhuang", "Rishabh Agarwal", "Yi Su", "John D. Co-Reyes", "Avi Singh", "Kate Baumli", "Shariq Iqbal", "Colton Bishop", "Rebecca Roelofs"], 354 "year": 2024, 355 "arxiv_id": "2409.12917", 356 "relevance": "Self-correction learning strategy evaluated in this paper; effective for math but found ineffective for plan validity." 357 }, 358 { 359 "title": "Learning General Policies for Planning through GPT Models", 360 "authors": ["Nicola Rossetti", "Marco Tummolo", "Alfonso E. Gerevini", "Luca Putelli", "Ivan Serina", "Mattia Chiari", "Matteo Olivato"], 361 "year": 2024, 362 "relevance": "PlanGPT model: most closely related prior work showing fine-tuned LLMs can generate plans, but evaluated only in-distribution." 363 }, 364 { 365 "title": "PlanBench: An Extensible Benchmark for Evaluating Large Language Models on Planning and Reasoning about Change", 366 "authors": ["Karthik Valmeekam", "Matthew Marquez", "Alberto Olmo", "Sarath Sreedharan", "Subbarao Kambhampati"], 367 "year": 2024, 368 "relevance": "The primary benchmark extended and used in this paper for evaluating LLM planning capabilities." 369 }, 370 { 371 "title": "Back to basics: Revisiting reinforce style optimization for learning from human feedback in llms", 372 "authors": ["Arash Ahmadian", "Chris Cremer", "Matthias Gallé", "Marzieh Fadaee", "Julia Kreutzer", "Ahmet Üstün", "Sara Hooker"], 373 "year": 2024, 374 "arxiv_id": "2402.14740", 375 "relevance": "Provides the RLOO framework used for sequence-level RL optimization in this paper." 376 }, 377 { 378 "title": "GSM-Symbolic: Understanding the Limitations of Mathematical Reasoning in Large Language Models", 379 "authors": ["Iman Mirzadeh", "Keivan Alizadeh", "Hooman Shahrokhi", "Oncel Tuzel", "Samy Bengio", "Mehrdad Farajtabar"], 380 "year": 2024, 381 "arxiv_id": "2410.05229", 382 "relevance": "Demonstrates limitations of LLM reasoning on out-of-distribution problems, supporting this paper's OOD evaluation design." 383 }, 384 { 385 "title": "Physics of Language Models: Part 2.2, How to Learn From Mistakes on Grade-School Math Problems", 386 "authors": ["Tian Ye", "Zicheng Xu", "Yuanzhi Li", "Zeyuan Allen-Zhu"], 387 "year": 2024, 388 "relevance": "Self-correction learning through error exposure, adapted in this paper for planning but found ineffective for plan validity." 389 }, 390 { 391 "title": "Chain of thoughtlessness: An analysis of CoT in planning", 392 "authors": ["Kaya Stechly"], 393 "year": 2024, 394 "arxiv_id": "2405.04776", 395 "relevance": "Found CoT prompting did not improve LLM planner validity, a finding this paper partially contradicts by showing executability improvements." 396 }, 397 { 398 "title": "The pitfalls of next-token prediction", 399 "authors": ["Gregor Bachmann"], 400 "year": 2024, 401 "arxiv_id": "2403.06963", 402 "relevance": "Theoretical analysis of snowballing error in autoregressive models, providing the theoretical basis for why LLM planning degrades with sequence length." 403 }, 404 { 405 "title": "Qwen2 technical report", 406 "authors": ["An Yang", "Baosong Yang", "Binyuan Hui", "Bo Zheng", "Bowen Yu", "Chang Zhou"], 407 "year": 2024, 408 "arxiv_id": "2407.10671", 409 "relevance": "Technical report for the Qwen2 model used as the base LLM in this paper's experiments." 410 } 411 ] 412 }