scan.json (24848B)
1 { 2 "paper": { 3 "title": "Augmented Language Models: a Survey", 4 "authors": [ 5 "Grégoire Mialon", 6 "Roberto Dessì", 7 "Maria Lomeli", 8 "Christoforos Nalmpantis", 9 "Ram Pasunuru", 10 "Roberta Raileanu", 11 "Baptiste Rozière", 12 "Timo Schick", 13 "Jane Dwivedi-Yu", 14 "Asli Celikyilmaz", 15 "Edouard Grave", 16 "Yann LeCun", 17 "Thomas Scialom" 18 ], 19 "year": 2023, 20 "venue": "arXiv preprint", 21 "arxiv_id": "2302.07842" 22 }, 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": false, 28 "justification": "No code repository or analysis scripts are provided. The paper is a narrative survey with no released artifacts." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": false, 33 "justification": "No dataset, corpus of reviewed papers, or structured extraction of surveyed works is released. A survey can release its search corpus or extracted data tables, but this one does not." 34 }, 35 "environment_specified": { 36 "applies": false, 37 "answer": false, 38 "justification": "This is a survey paper with no computational experiments, so environment specifications are structurally inapplicable." 39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "No instructions are provided for reproducing the survey's paper selection, classification, or analysis. The paper does not describe a systematic search methodology that could be replicated." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": false, 49 "answer": false, 50 "justification": "This is a narrative survey that does not run experiments or perform statistical aggregation. No original quantitative results are reported." 51 }, 52 "significance_tests": { 53 "applies": false, 54 "answer": false, 55 "justification": "No experiments or statistical comparisons are conducted by the authors. Tables (e.g., Table 1 on GSM8K) reproduce results from other papers." 
56 }, 57 "effect_sizes_reported": { 58 "applies": false, 59 "answer": false, 60 "justification": "No original experiments are conducted. The survey reproduces results from other works without performing meta-analytic aggregation." 61 }, 62 "sample_size_justified": { 63 "applies": false, 64 "answer": false, 65 "justification": "No experiments or data collection are performed by the authors. This is a narrative review." 66 }, 67 "variance_reported": { 68 "applies": false, 69 "answer": false, 70 "justification": "No original experiments are conducted. No statistical aggregation across reviewed papers is performed." 71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": false, 77 "justification": "The survey does not compare itself against prior surveys or provide a structured comparison framework showing how it differs from or improves upon existing surveys (e.g., Huang and Chang 2022 or Qiao et al. 2022, which are mentioned as related surveys on reasoning)." 78 }, 79 "baselines_contemporary": { 80 "applies": false, 81 "answer": false, 82 "justification": "No experimental baselines are applicable to a narrative survey paper." 83 }, 84 "ablation_study": { 85 "applies": false, 86 "answer": false, 87 "justification": "No system or method is proposed, so ablation is not applicable." 88 }, 89 "multiple_metrics": { 90 "applies": false, 91 "answer": false, 92 "justification": "No evaluation of any system is conducted by the authors." 93 }, 94 "human_evaluation": { 95 "applies": false, 96 "answer": false, 97 "justification": "No system outputs are produced that would require human evaluation. This is a literature review." 98 }, 99 "held_out_test_set": { 100 "applies": false, 101 "answer": false, 102 "justification": "No experiments are conducted. Not applicable to a survey." 
103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "The survey organizes reviewed work into categories: reasoning (Section 2), tools and acting (Section 3), and learning methods (Section 4), with subcategories such as prompting approaches, recursive prompting, retrieval-augmented models, code interpreters, physical/virtual agents, supervision, and RL. Table 1 provides per-method breakdowns on GSM8K and Table 2 compares retrieval-augmented models." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section 2.4 discusses limitations of abstract reasoning, noting there is no guarantee intermediate steps are valid and reasoning may not actually be used by the model. Section 4.3 discusses limitations of current RL methods including instability. Section 5 discusses ethical concerns and limitations of ALMs." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The paper reports negative findings from reviewed work: that the compositionality gap does not narrow with model size (Section 2.1, Press et al. 2022), that chain-of-thought can lead to correct predictions despite nonsensical intermediate reasoning (Section 5, Lewkowycz et al. 2022), and that there is little evidence retrieval-augmented LMs improve truthfulness (Section 5, Krishna et al. 2021)." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "The abstract claims the survey 'reviews works in which language models are augmented with reasoning skills and the ability to use tools' and concludes the direction 'has the potential to address common limitations.' The body comprehensively covers reasoning (Section 2), tools (Section 3), and learning approaches (Section 4), supporting both claims. The potential-to-address claim is appropriately hedged." 
125 }, 126 "causal_claims_justified": { 127 "applies": false, 128 "answer": false, 129 "justification": "The survey does not make original causal claims. It describes findings and mechanisms from reviewed papers but does not itself claim causal relationships based on its own analysis." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper's title and framing suggest broad coverage ('Augmented Language Models: a Survey') but the scope boundaries are not explicitly stated. The paper does not clearly specify what is excluded — for example, vision-language models are partially covered, robotics is partially covered, but the inclusion/exclusion criteria for the survey are never stated. Section 1.2 notes 'Other axes could naturally have been chosen' but does not specify what was excluded or why." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": false, 138 "answer": false, 139 "justification": "As a survey paper presenting no original empirical results, alternative explanations for its own findings are not applicable. The survey does note alternative interpretations of reviewed work (e.g., whether LMs truly reason or just produce helpful context, Section 2.4)." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": false, 145 "answer": false, 146 "justification": "No models are used by the authors. This is a survey paper." 147 }, 148 "prompts_provided": { 149 "applies": false, 150 "answer": false, 151 "justification": "No prompting experiments are conducted by the authors. Prompt examples shown (Figures 1-6) are reproduced from reviewed papers." 152 }, 153 "hyperparameters_reported": { 154 "applies": false, 155 "answer": false, 156 "justification": "No experiments are conducted by the authors." 157 }, 158 "scaffolding_described": { 159 "applies": false, 160 "answer": false, 161 "justification": "No agentic scaffolding is used by the authors. 
This is a survey paper." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": false, 166 "justification": "The survey does not describe how papers were selected for inclusion. There is no search methodology, database queries, inclusion/exclusion criteria, or filtering pipeline described. The reader cannot understand or reproduce the paper selection process." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": false, 173 "justification": "There is no dedicated limitations section for the survey itself. Section 4.3 discusses 'Limitations and future directions' of the reviewed methods (RL instability, data scarcity), and Section 5 includes a Discussion, but neither addresses limitations of the survey's own methodology or coverage." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": false, 178 "justification": "No threats to validity of the survey itself are discussed. There is no discussion of potential bias in paper selection, coverage gaps, or limitations of the narrative review approach." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "The scope is loosely described: 'the survey focuses on works that combine reasoning or tools with LMs' (Section 1.2) and 'we focus on LLMs' but the paper does not clearly state what is excluded or what the results do NOT show. For instance, it is unclear whether multi-modal foundation models, pure RL agents without LM components, or closed-source commercial systems are in scope." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": false, 190 "justification": "No raw data (list of all surveyed papers, search queries, extraction spreadsheets) is available. The survey cannot be independently verified for completeness or accuracy of coverage." 
191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": false, 195 "justification": "The paper does not describe how the surveyed papers were identified or collected. There is no mention of search databases, queries, time periods, or systematic search protocols." 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants are involved. The paper is a literature survey, not a study with recruited subjects. The data source (research papers) is not a standard benchmark either, so NA applies." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": false, 205 "justification": "No data pipeline is documented. The survey does not describe how papers were found, screened, included, or categorized. The classification into reasoning/tools/learning appears to be the authors' taxonomy but the process of arriving at it is not documented." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": false, 212 "justification": "No funding source is disclosed. The Acknowledgements section only thanks Marco Baroni for feedback. All authors are Meta AI employees, but no funding statement is provided." 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "All authors clearly list their Meta AI affiliation and email addresses on the first page. Roberto Dessì's additional affiliation at Universitat Pompeu Fabra is also noted." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": false, 222 "justification": "All authors work at Meta AI. Meta has commercial interests in the technologies surveyed (LLMs, tool-augmented AI). The survey reviews several Meta/FAIR papers (e.g., Toolformer, Atlas, PEER, Galactica) alongside other work. The funder (Meta) is not independent of the survey's conclusions about the promise of augmented language models." 
223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial interests statement is provided. Given that all authors are Meta employees and the survey covers Meta's research extensively, a disclosure would be expected." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": false, 233 "answer": false, 234 "justification": "This is a survey paper that does not evaluate any model on benchmarks. Contamination concerns are not applicable." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": false, 238 "answer": false, 239 "justification": "No model evaluation is conducted. Not applicable to a survey paper." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": false, 243 "answer": false, 244 "justification": "No model evaluation is conducted. Not applicable to a survey paper." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants are involved. This is a literature survey." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants are involved." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants are involved." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants are involved." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants are involved." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants are involved." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants are involved." 
282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": false, 287 "answer": false, 288 "justification": "This is a survey paper with no method of its own. Cost reporting is not applicable." 289 }, 290 "compute_budget_stated": { 291 "applies": false, 292 "answer": false, 293 "justification": "This is a survey paper with no computational experiments. Not applicable." 294 } 295 } 296 }, 297 "claims": [ 298 { 299 "claim": "Augmented Language Models (ALMs) that combine reasoning and tool use can address common limitations of traditional LMs such as interpretability, consistency, and scalability issues.", 300 "evidence": "Abstract and Section 5 ('Discussion - Augmented Language Models benefits') list potential advantages: truthfulness through grounding, uncertainty estimation, interpretability via intermediate reasoning steps, and enhanced capabilities. However, the paper acknowledges 'there is surprisingly little evidence' that retrieval-augmented LMs improve truthfulness (Section 5, citing Krishna et al. 2021).", 301 "supported": "weak" 302 }, 303 { 304 "claim": "Chain-of-thought prompting enables LMs to be better reasoners, with the capability emerging at scale (100B+ parameters).", 305 "evidence": "Section 2.1 cites Wei et al. (2022c) for few-shot CoT and Kojima et al. (2022) for zero-shot CoT. Table 1 shows CoT improving GSM8K accuracy from 15.6% to 46.9% (text-davinci-002) and 19.7% to 63.1% (code-davinci-002). Section 2.1 notes emergence requires 100B+ parameters per Wei et al. (2022b).", 306 "supported": "strong" 307 }, 308 { 309 "claim": "Tool-augmented LMs can be seen as a generalization of the non-parametric framework, potentially yielding smaller models that retain capabilities of larger counterparts.", 310 "evidence": "Section 5 ('Generalizing the non-parametric framework') discusses this claim conceptually, citing RETRO and Atlas as examples where retrieval reduces parameter requirements. 
Table 2 compares retrieval-augmented models. However, this is presented as a conjecture about future scaling laws rather than an empirically validated claim.", 311 "supported": "weak" 312 }, 313 { 314 "claim": "Combining reasoning and tool use via reinforcement learning from human feedback (RLHF) yields models that outperform humans in question-answering tasks.", 315 "evidence": "Section 4.2 describes WebGPT (Nakano et al. 2021) which, after fine-tuning with RLHF, produced answers preferred over human-generated ones on two QA datasets. This is cited evidence from a single paper, not independently verified in this survey.", 316 "supported": "moderate" 317 }, 318 { 319 "claim": "The compositionality gap in language models does not narrow with increasing model scale.", 320 "evidence": "Section 2.1 cites Press et al. (2022) who 'observe that this gap does not narrow when increasing the size of the model.' This is evidence from a single study on 2-hop questions.", 321 "supported": "moderate" 322 } 323 ], 324 "methodology_tags": [ 325 "meta-analysis" 326 ], 327 "key_findings": "This survey from Meta AI categorizes augmented language model (ALM) research into three axes: reasoning (chain-of-thought prompting, recursive prompting, fine-tuning), tool use (retrieval, search engines, code interpreters, robotic control), and learning methods (supervision, reinforcement learning). The paper argues that combining reasoning and tools allows LMs to depart from pure language modeling toward more capable agents. Key limitations identified include the lack of guarantee that intermediate reasoning steps are valid, instability of RL training methods, and the open question of whether LMs can achieve System 2 reasoning through current approaches.", 328 "red_flags": [ 329 { 330 "flag": "No systematic search methodology", 331 "detail": "The survey does not describe any systematic search protocol, database queries, inclusion/exclusion criteria, or paper selection methodology. 
This makes the coverage impossible to evaluate for completeness or bias. The reader cannot know what was excluded or why." 332 }, 333 { 334 "flag": "Potential conflict of interest: Meta authors surveying Meta research", 335 "detail": "All 13 authors are Meta AI employees. The survey prominently features Meta/FAIR papers (Toolformer, Atlas, PEER, Galactica, BlenderBot, etc.) alongside work from other groups. No conflict of interest statement is provided. While the coverage of non-Meta work appears broad, the framing and conclusions may favor Meta's research directions." 336 }, 337 { 338 "flag": "No quality assessment of reviewed papers", 339 "detail": "The survey describes what methods exist and how they work, but does not assess the methodological quality or strength of evidence in the reviewed papers. Results from different papers are presented side-by-side (e.g., Table 1) without noting differences in evaluation methodology, so weak results can gain unearned credibility from the strong results presented alongside them." 340 }, 341 { 342 "flag": "Speculative conclusions presented as survey findings", 343 "detail": "Several claims in Section 5 and the abstract are speculative (e.g., ALMs 'have the potential to address common limitations') rather than conclusions supported by structured analysis of the evidence. The claim about truthfulness is immediately qualified by noting 'surprisingly little evidence' supports it." 344 } 345 ], 346 "cited_papers": [ 347 { 348 "title": "Chain of thought prompting elicits reasoning in large language models", 349 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Ed Chi", "Quoc Le", "Denny Zhou"], 350 "year": 2022, 351 "arxiv_id": "2201.11903", 352 "relevance": "Foundational work on chain-of-thought prompting for LLM reasoning, central to the survey's reasoning section."
353 }, 354 { 355 "title": "Toolformer: Language models can teach themselves to use tools", 356 "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì", "Roberta Raileanu", "Maria Lomeli", "Luke Zettlemoyer", "Nicola Cancedda", "Thomas Scialom"], 357 "year": 2023, 358 "arxiv_id": "2302.04761", 359 "relevance": "Key paper on self-supervised tool learning for LMs, demonstrating LMs can learn when and how to call external tools." 360 }, 361 { 362 "title": "ReAct: Synergizing reasoning and acting in language models", 363 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik Narasimhan", "Yuan Cao"], 364 "year": 2022, 365 "arxiv_id": "2210.03629", 366 "relevance": "Influential work on interleaving reasoning and acting in LMs, combining chain-of-thought with tool use for decision making." 367 }, 368 { 369 "title": "PAL: Program-aided language models", 370 "authors": ["Luyu Gao", "Aman Madaan", "Shuyan Zhou", "Uri Alon", "Pengfei Liu", "Yiming Yang", "Jamie Callan", "Graham Neubig"], 371 "year": 2022, 372 "relevance": "Demonstrates offloading computation to a Python interpreter via CoT-style prompting, achieving strong results on reasoning benchmarks." 373 }, 374 { 375 "title": "WebGPT: Browser-assisted question-answering with human feedback", 376 "authors": ["Reiichiro Nakano", "Jacob Hilton", "Suchir Balaji", "Jeff Wu", "Long Ouyang"], 377 "year": 2021, 378 "arxiv_id": "2112.09332", 379 "relevance": "Early work on training LMs to use web browsers via RLHF, demonstrating tool-augmented QA that surpasses human performance." 380 }, 381 { 382 "title": "Improving language models by retrieving from trillions of tokens", 383 "authors": ["Sebastian Borgeaud", "Arthur Mensch", "Jordan Hoffmann"], 384 "year": 2022, 385 "relevance": "RETRO: key retrieval-augmented LM showing non-parametric memory can match larger model capabilities with fewer parameters." 
386 }, 387 { 388 "title": "Atlas: Few-shot learning with retrieval augmented language models", 389 "authors": ["Gautier Izacard", "Patrick Lewis", "Maria Lomeli"], 390 "year": 2022, 391 "arxiv_id": "2208.03299", 392 "relevance": "Jointly trains retriever and seq2seq model for few-shot learning, demonstrating strong performance despite being much smaller than other LLMs." 393 }, 394 { 395 "title": "Training language models to follow instructions with human feedback", 396 "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang", "Diogo Almeida"], 397 "year": 2022, 398 "arxiv_id": "2203.02155", 399 "relevance": "InstructGPT paper on RLHF for instruction following, foundational to the learning methods discussed in the survey." 400 }, 401 { 402 "title": "Do as I can, not as I say: Grounding language in robotic affordances", 403 "authors": ["Michael Ahn", "Anthony Brohan", "Noah Brown"], 404 "year": 2022, 405 "arxiv_id": "2204.01691", 406 "relevance": "SayCan: combines LM planning with robot affordances for physical task execution, key paper on LMs acting in the physical world." 407 }, 408 { 409 "title": "Galactica: A large language model for science", 410 "authors": ["Ross Taylor", "Marcin Kardas", "Guillem Cucurull", "Thomas Scialom"], 411 "year": 2022, 412 "arxiv_id": "2211.09085", 413 "relevance": "Demonstrates prompt pre-training with explicit reasoning tokens for scientific reasoning, a key approach to teaching LMs to reason." 414 }, 415 { 416 "title": "Evaluating large language models trained on code", 417 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 418 "year": 2021, 419 "relevance": "Codex paper: foundational to code generation and tool-use capabilities of LMs, enabling code-based reasoning and tool invocation." 420 }, 421 { 422 "title": "STaR: Self-taught reasoner bootstrapping reasoning with reasoning", 423 "authors": ["Eric Zelikman", "Jesse Mu", "Noah D. 
Goodman", "Yuhuai Tony Wu"], 424 "year": 2022, 425 "relevance": "Key bootstrapping approach for teaching LMs to reason without extensive human demonstrations, bridging supervision and RL." 426 }, 427 { 428 "title": "Language models are few-shot learners", 429 "authors": ["Tom B. Brown", "Benjamin Mann", "Nick Ryder"], 430 "year": 2020, 431 "relevance": "GPT-3 paper establishing few-shot prompting as a paradigm for LM capability, foundational to both reasoning and tool-use research." 432 } 433 ] 434 }