scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24500B)
      1 {
      2   "paper": {
      3     "title": "AdaPlanner: Adaptive Planning from Feedback with Language Models",
      4     "authors": [
      5       "Haotian Sun",
      6       "Yuchen Zhuang",
      7       "Lingkai Kong",
      8       "Bo Dai",
      9       "Chao Zhang"
     10     ],
     11     "year": 2023,
     12     "venue": "arXiv",
     13     "arxiv_id": "2305.16653"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The abstract states 'The implementation of AdaPlanner is available on https://github.com/haotiansun14/AdaPlanner', providing a working GitHub URL."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper uses publicly available standard benchmarks: ALFWorld and MiniWoB++, both of which are publicly released environments. The paper does not collect proprietary data."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper does not include a requirements.txt, Dockerfile, or detailed environment setup section listing library versions. The appendix describes experimental setup at a high level but does not provide enough detail to recreate the environment."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "While a GitHub link is provided, the paper itself contains no step-by-step reproduction instructions. The appendix describes prompts and experimental configurations but does not provide commands or scripts to replicate experiments."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "Tables 2 and 3 report success rates as point estimates only (e.g., '91.79%'). No confidence intervals, error bars, or uncertainty measures are reported for any result."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper claims AdaPlanner 'outperforms state-of-the-art baselines by 3.73% and 4.11%' but provides no statistical significance tests (no p-values, t-tests, or equivalent) to support comparative claims."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The paper reports percentage improvements with baseline context: '3.73% improvement' from 88.06% to 91.79% (ALFWorld) and '4.11% improvement' from ~88.76% to 92.87% (MiniWoB++). The schema description explicitly states that 'A paper that says 12% improvement over baseline (from 45% to 57%) provides enough context for YES.' Tables 2 and 3 provide the raw baseline scores alongside AdaPlanner scores, giving readers full effect magnitude context."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The 134 ALFWorld tasks and 53 MiniWoB++ tasks are described but there is no justification for why these specific numbers were chosen, and no power analysis is provided."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "All results in Tables 2 and 3 are single point estimates. The paper does not report results averaged over multiple runs with any spread measures (standard deviation, IQR, etc.), making result stability impossible to assess."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The paper compares against multiple baselines including BUTLER, ReAct, Reflexion (ALFWorld) and CC-Net, WGE, WebN-T5-3B, RCI (MiniWoB++), covering both training-based and prompting-based methods."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The baselines include contemporaneous work (Reflexion, RCI, and ReAct, all from 2023), so the comparison covers methods published within the same year as the paper."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Figure 4 includes ablation studies testing: (c) AdaPlanner with and without the code interface (CI), and (d) AdaPlanner with and without skill discovery (SD), demonstrating the contribution of each component."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The paper uses only success rate (%) as the evaluation metric across all experiments. Section 8.1 states 'we use success rate (%) to evaluate the performance of tested methods' and no other metrics are used."
     85       },
     86       "human_evaluation": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "The evaluation is entirely automated through task success/failure in simulated environments (ALFWorld, MiniWoB++). Human evaluation is not relevant to these automated benchmark claims."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The paper evaluates on standard benchmark test sets (134 ALFWorld tasks and 53 MiniWoB++ tasks) following prior work. Demonstrations used for prompting are separate from evaluation tasks."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Table 2 provides per-category results for all six ALFWorld task types (Pick, Clean, Heat, Cool, Examine, Pick two). Table 3 separates MiniWoB++ into feedback and no-feedback subsets."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "The paper does not discuss failure cases or provide error analysis. The conclusion section mentions one limitation (needing few-shot demonstrations) but no analysis of where the approach fails."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The ablation studies (Figure 4c, 4d) show performance drops when components are removed. Table 2 also shows that AdaPlanner with gpt-3.5-turbo underperforms relative to text-davinci-002 in some tasks, which the paper discusses in Section 4."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims AdaPlanner outperforms baselines by 3.73% and 4.11% while using 2x and 600x fewer samples. These claims are supported by Tables 2, 3, and Figure 3 in the paper."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The paper makes causal claims through ablation studies (e.g., 'without the code interface, AdaPlanner's performance substantially drops'), which are supported by controlled single-variable manipulation ablations shown in Figure 4c and 4d."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper tests on two specific text-based environments (ALFWorld and MiniWoB++) but makes broader claims about 'sequential decision-making tasks' in general. The conclusion does not bound claims to these specific environments."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper does not discuss alternative explanations for the results. For instance, performance differences could be attributed to prompt engineering rather than the closed-loop mechanism, but this is not addressed."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper specifies exact model identifiers: 'gpt-3.5-turbo', 'text-davinci-002', and 'text-davinci-003' are explicitly named in Tables 2 and 3 and Section 8.2."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Appendix 8.3 provides the full prompt text for all stages (basic_info, initial_planning, code_check, refinement, start_from determination) with actual example solutions for all six ALFWorld task types and MiniWoB++ tasks."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "The paper does not report temperature, top-p, max tokens, or other sampling hyperparameters for the LLM API calls. These settings are not mentioned anywhere in the paper or appendix."
    149       },
    150       "scaffolding_described": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The paper describes the agentic scaffolding in detail: the planner/refiner architecture, in-plan and out-of-plan refinement mechanisms, the ask_LLM() action, assertion-based error detection, and the refine-then-resume mechanism are all described in Sections 3.1-3.3."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "The paper describes how the environments are set up, how tasks are selected (9 MiniWoB++ tasks with feedback, 53 tasks from RCI evaluation), and how demonstrations are allocated (Table 4). The skill discovery pipeline is also described in Section 3.3."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 6 'Conclusion and Limitations' includes a dedicated limitations discussion, noting that AdaPlanner still requires few-shot expert demonstrations for complex tasks."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "The limitations section mentions only one limitation (need for few-shot demonstrations) and does not discuss threats to validity such as benchmark contamination, sensitivity to prompt phrasing, or the limited scope of the two test environments."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "The paper does not explicitly state what the results do NOT show. There is no statement bounding results to text-based environments or these specific benchmarks, and the paper implies broader applicability without bounding the claims."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "While the GitHub repository is provided, the paper does not describe providing raw experimental logs or trajectories for independent verification of the reported success rates."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The experimental setup (Section 8.1) describes the environments used, task types, and how demonstrations were collected (38 human-written + 21 discovered by skill discovery for MiniWoB++, 6 expert samples for ALFWorld)."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No human participants are involved. The evaluation uses automated environments (ALFWorld, MiniWoB++) with no human subjects."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section 4 and Appendix 8.1-8.2 describe the evaluation pipeline: how tasks are run, how success is determined, and how skill discovery produces additional examples. The pipeline from task assignment to success measurement is sufficiently documented."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No acknowledgments section or funding disclosure is present in the paper. There is no mention of grants, corporate sponsors, or funding agencies."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "All five authors are affiliated with Georgia Institute of Technology, as clearly stated on the first page. No authors appear to be affiliated with OpenAI or other providers of the evaluated models."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": false,
    213         "answer": false,
    214         "justification": "No funding is disclosed, so independence cannot be assessed. This item is inapplicable in the absence of any funding disclosure."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "There is no competing interests statement or financial disclosure in the paper. Absence of disclosure is not absence of conflict."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The paper uses GPT-3 (text-davinci-002) and GPT-3.5 (text-davinci-003, gpt-3.5-turbo) but does not state the training data cutoff dates for any of these models."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "The paper does not discuss whether the ALFWorld or MiniWoB++ benchmarks or their task descriptions may have been present in the GPT-3/3.5 training data."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "ALFWorld (2021) and MiniWoB++ (2018) were both published before GPT-3/3.5 training data cutoffs. The paper does not address the possibility that task descriptions or solutions may have appeared in training data."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved in this study."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved in this study."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved in this study."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved in this study."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved in this study."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved in this study."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants are involved in this study."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "The paper does not report API costs, token counts, or wall-clock time per episode. The paper mentions reducing LLM calls as a benefit but does not quantify actual inference costs."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No total compute budget, GPU hours, or API spend is reported. The paper does not quantify the computational resources required to run the experiments."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "AdaPlanner outperforms state-of-the-art baselines by 3.73% in ALFWorld and 4.11% in MiniWoB++ while using 2x and 600x fewer samples, respectively.",
    292       "evidence": "Table 2 shows AdaPlanner (GPT-3) achieves 91.79% vs. Reflexion's 88.06% in ALFWorld. Table 3 shows AdaPlanner achieves 92.87% vs. RCI's 91.00% in MiniWoB++. Figure 3 shows sample efficiency comparison.",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "The code interface (CI) is essential: removing it causes performance to drop from 81% to 46% in ALFWorld and from 93% to 66% in MiniWoB++.",
    297       "evidence": "Figure 4c shows ablation with and without code interface for both environments. The comparison is controlled—only the code interface is removed.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "Skill discovery significantly enhances performance: in ALFWorld, success rate nearly doubles with skill discovery (19% to 38%), and in MiniWoB++ increases by about 15% (45% to 60%).",
    302       "evidence": "Figure 4d shows ablation with and without skill discovery under zero-shot/one-shot settings. Section 4 describes these results.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "AdaPlanner's closed-loop refinement consistently improves performance with each additional refinement iteration, even with as few as 2 demonstrations total.",
    307       "evidence": "Figure 4a shows success rate increasing with number of closed-loop corrections across different sample counts (2, 4, 6 samples).",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "gpt-3.5-turbo underperforms text-davinci-002 on decision-making tasks due to greater hallucination, but AdaPlanner's code prompts mitigate this.",
    312       "evidence": "Table 2 shows AdaPlanner (GPT-3) at 91.79% vs. AdaPlanner (GPT-3.5) at 80.60%. Section 4 discusses this observation. However, 'gpt-3.5-turbo might be a smaller-scale model' is presented as hypothesis, not established fact.",
    313       "supported": "weak"
    314     }
    315   ],
    316   "methodology_tags": [
    317     "benchmark-eval"
    318   ],
    319   "key_findings": "AdaPlanner is a closed-loop LLM agent framework that uses code-based prompting and adaptive plan refinement (in-plan and out-of-plan) to outperform prior baselines on ALFWorld (91.79% success rate) and MiniWoB++ (92.87% success rate) while requiring fewer demonstrations. Ablation studies show that both the code interface and skill discovery module contribute substantially to performance. The approach operates entirely through prompting without task-specific training, making it broadly applicable.",
    320   "red_flags": [
    321     {
    322       "flag": "No variance or uncertainty quantification",
    323       "detail": "All results are reported as single point estimates with no standard deviation, confidence intervals, or indication of how many runs were conducted. Result stability across random seeds or multiple runs is entirely unknown."
    324     },
    325     {
    326       "flag": "No statistical significance testing",
    327       "detail": "Comparative claims (e.g., outperforming baselines by 3.73%) are made without any statistical tests. The margins are small and statistical significance is never assessed."
    328     },
    329     {
    330       "flag": "Benchmark contamination unaddressed",
    331       "detail": "ALFWorld (2021) and MiniWoB++ (2018) predate the training data cutoffs of the GPT-3/3.5 models used. The paper does not discuss whether task descriptions, solutions, or environment behavior could have appeared in training data."
    332     },
    333     {
    334       "flag": "Only success rate reported",
    335       "detail": "The paper uses a single metric (success rate) across all experiments, which obscures variation in efficiency, number of steps required, and partial task completion."
    336     },
    337     {
    338       "flag": "Missing hyperparameters",
    339       "detail": "Temperature, top-p, max tokens, and other LLM sampling hyperparameters are never reported, making exact reproduction impossible and affecting result interpretation."
    340     },
    341     {
    342       "flag": "No failure case analysis",
    343       "detail": "The paper does not analyze failure cases or discuss where AdaPlanner breaks down. The limitations section mentions only the need for few-shot demonstrations without discussing systematic failure modes."
    344     }
    345   ],
    346   "cited_papers": [
    347     {
    348       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    349       "authors": [
    350         "Shunyu Yao",
    351         "Jeffrey Zhao",
    352         "Dian Yu",
    353         "Nan Du",
    354         "Izhak Shafran",
    355         "Karthik Narasimhan",
    356         "Yuan Cao"
    357       ],
    358       "year": 2023,
    359       "relevance": "Foundational paper on LLM agents combining reasoning traces with action-taking, directly compared as a baseline."
    360     },
    361     {
    362       "title": "Reflexion: an Autonomous Agent with Dynamic Memory and Self-Reflection",
    363       "authors": [
    364         "Noah Shinn",
    365         "Beck Labash",
    366         "Ashwin Gopinath"
    367       ],
    368       "year": 2023,
    369       "arxiv_id": "2303.11366",
    370       "relevance": "Key baseline for LLM agents with self-reflection and memory, compared directly on ALFWorld."
    371     },
    372     {
    373       "title": "Language Models Can Solve Computer Tasks",
    374       "authors": [
    375         "Geunwoo Kim",
    376         "Pierre Baldi",
    377         "Stephen McAleer"
    378       ],
    379       "year": 2023,
    380       "arxiv_id": "2303.17491",
    381       "relevance": "RCI method: baseline implicit closed-loop LLM agent for computer tasks, directly compared on MiniWoB++."
    382     },
    383     {
    384       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    385       "authors": [
    386         "Jason Wei",
    387         "Xuezhi Wang",
    388         "Dale Schuurmans",
    389         "Maarten Bosma",
    390         "Brian Ichter",
    391         "Fei Xia",
    392         "Ed Chi",
    393         "Quoc Le",
    394         "Denny Zhou"
    395       ],
    396       "year": 2022,
    397       "arxiv_id": "2201.11903",
    398       "relevance": "Foundational open-loop LLM planning method used as a baseline reference."
    399     },
    400     {
    401       "title": "ALFWorld: Aligning Text and Embodied Environments for Interactive Learning",
    402       "authors": [
    403         "Mohit Shridhar",
    404         "Xingdi Yuan",
    405         "Marc-Alexandre Côté",
    406         "Yonatan Bisk",
    407         "Adam Trischler",
    408         "Matthew Hausknecht"
    409       ],
    410       "year": 2021,
    411       "relevance": "Primary benchmark environment used for evaluation; essential reference for the evaluation design."
    412     },
    413     {
    414       "title": "PAL: Program-aided Language Models",
    415       "authors": [
    416         "Luyu Gao",
    417         "Aman Madaan",
    418         "Shuyan Zhou",
    419         "Uri Alon",
    420         "Pengfei Liu",
    421         "Yiming Yang",
    422         "Jamie Callan",
    423         "Graham Neubig"
    424       ],
    425       "year": 2022,
    426       "arxiv_id": "2211.10435",
    427       "relevance": "Related approach using code-based prompting for LLM reasoning, motivates AdaPlanner's code interface design."
    428     },
    429     {
    430       "title": "Code as Policies: Language Model Programs for Embodied Control",
    431       "authors": [
    432         "Jacky Liang",
    433         "Wenlong Huang",
    434         "Fei Xia",
    435         "Peng Xu",
    436         "Karol Hausman",
    437         "Brian Ichter",
    438         "Pete Florence",
    439         "Andy Zeng"
    440       ],
    441       "year": 2022,
    442       "arxiv_id": "2209.07753",
    443       "relevance": "Related code-based prompting method for robot control, relevant to AdaPlanner's code interface approach."
    444     },
    445     {
    446       "title": "Describe, Explain, Plan and Select: Interactive Planning with Large Language Models Enables Open-World Multi-Task Agents",
    447       "authors": [
    448         "Zihao Wang",
    449         "Shaofei Cai",
    450         "Anji Liu",
    451         "Xiaojian Ma",
    452         "Yitao Liang"
    453       ],
    454       "year": 2023,
    455       "arxiv_id": "2302.01560",
    456       "relevance": "DEPS: only prior method that modifies entire plans based on feedback, directly motivates AdaPlanner's design."
    457     },
    458     {
    459       "title": "Inner Monologue: Embodied Reasoning through Planning with Language Models",
    460       "authors": [
    461         "Wenlong Huang",
    462         "Fei Xia",
    463         "Ted Xiao",
    464         "Harris Chan",
    465         "Jacky Liang",
    466         "Pete Florence",
    467         "Andy Zeng",
    468         "Jonathan Tompson",
    469         "Igor Mordatch",
    470         "Yevgen Chebotar",
    471         "Pierre Sermanet",
    472         "Noah Brown",
    473         "Tomas Jackson",
    474         "Linda Luu",
    475         "Sergey Levine",
    476         "Karol Hausman",
    477         "Brian Ichter"
    478       ],
    479       "year": 2022,
    480       "arxiv_id": "2207.05608",
    481       "relevance": "Baseline implicit closed-loop LLM agent using environmental feedback for single-step action selection."
    482     },
    483     {
    484       "title": "Language Models as Zero-Shot Planners: Extracting Actionable Knowledge for Embodied Agents",
    485       "authors": [
    486         "Wenlong Huang",
    487         "Pieter Abbeel",
    488         "Deepak Pathak",
    489         "Igor Mordatch"
    490       ],
    491       "year": 2022,
    492       "arxiv_id": "2201.07207",
    493       "relevance": "Open-loop planning baseline demonstrating limitations of LLMs without feedback adaptation."
    494     }
    495   ]
    496 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs