scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24372B)
      1 {
      2   "paper": {
      3     "title": "StateFlow: Enhancing LLM Task-Solving through State-Driven Workflows",
      4     "authors": ["Yiran Wu", "Tianwei Yue", "Shaokun Zhang", "Chi Wang", "Qingyun Wu"],
      5     "year": 2024,
      6     "venue": "COLM 2024",
      7     "arxiv_id": "2403.11322",
      8     "doi": "10.48550/arXiv.2403.11322"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "StateFlow models LLM task-solving as finite state machines, achieving success rates 13 and 28 percentage points higher than ReAct on InterCode SQL and ALFWorld respectively, with 3-5x less inference cost. The framework decomposes long prompts into state-specific shorter prompts, reducing token usage while improving focus. Combining StateFlow with Reflexion further improves ALFWorld performance from 84.3% to 94.8%. Ablation studies show the Observe and Error states are the most important components.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Code is available at https://github.com/yiranwu0/StateFlow (stated in Appendix A.1)."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available benchmarks: InterCode (SQL and Bash) and ALFWorld. The InterCode repository is cited at https://github.com/princeton-nlp/intercode."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, or detailed environment setup is described in the paper. Only the AutoGen version (v0.2.17) is mentioned."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. The code repository is linked but no README or reproduction guide is described."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results are reported as point estimates without confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims StateFlow 'outperforms' baselines based solely on comparing numbers without any statistical significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports percentage improvements with baseline context, e.g., '13% higher success rate compared to ReAct' (from 50.68% to 63.73%), and '5× less cost'. Tables 1-4 provide full baseline comparisons."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for using the full InterCode SQL (1034 tasks), Bash (200 tasks), or ALFWorld (134 tasks) benchmarks. No power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "For ALFWorld, results are 'average success rate of 3 attempts' (Table 4) but no standard deviation or variance is reported across those attempts. InterCode results appear to be single runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Multiple baselines are compared: Plan & Solve, ReAct, ReAct Refined (for SQL), ALFChat with 2 and 3 agents (for ALFWorld). Tables 1, 2, 4."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include ReAct (2022), Plan & Solve (2023), ALFChat/AutoGen (2023), and Reflexion (2023), all contemporary at the time of writing."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 3 presents an ablation study removing individual states (Verify, Error, Observe) from the SQL StateFlow model, showing each component's contribution."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics are reported: Success Rate, Error Rate, Turns, Cost (Tables 1-2), and Reward (Tables 6-7)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation is included. All evaluation is automated via benchmark metrics (success rate, reward)."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper uses established benchmarks (InterCode SQL/Bash, ALFWorld) with their standard evaluation splits."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "SQL results are broken down by difficulty level (Easy, Medium, Hard, Extra Hard) in Table 5. ALFWorld results are broken down by task type (Pick, Clean, Heat, Cool, Look, Pick 2) in Table 4."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 4.2 and Appendix B.2 analyze ALFWorld failures: 15/21 failed tasks ended in Pick state, with three failure reasons identified (hallucination, wrong object, loops). Table 9 classifies failures by ending state."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that GPT-4-Turbo did not improve over GPT-3.5-Turbo on Bash tasks ('Switching to GPT-4-Turbo has little effect on the methods, where the two baselines even suffer from a decrement in accuracy'). Also, GPT-3.5-Instruct showed performance drops for both methods (Table 12)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims of 13% and 28% improvement over ReAct and 5x/3x cost reduction are supported by Tables 1 and 4. The Reflexion integration claim (84.3% to 94.8%) is supported by Figure 5."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The ablation study (Table 3) provides controlled single-variable manipulation to support causal claims about which components matter. Each state is removed individually while keeping others constant."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims 'Enhancing LLM Task-Solving' broadly, but results are limited to three specific benchmarks (InterCode SQL/Bash, ALFWorld) with primarily GPT-3.5 and GPT-4. The paper does not bound its claims to these specific task types."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No discussion of alternative explanations for the improvements. For example, the cost reduction may largely come from shorter prompts rather than the state machine structure itself. The ReAct Refined baseline suggests prompt engineering alone accounts for much of the gain, but this is not explored."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures success rate, error rate, turns, and cost on specific benchmarks and reports them as such without inflating claims beyond what these metrics measure."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper specifies 'GPT-3.5-Turbo' and 'GPT-4-Turbo (both with the 1106 version)' in Section 4.1, so a specific API version is pinned for both models."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full prompt text for all states is provided in the appendix: Tables 13-19 contain the actual instructions used for SQL, Bash, and ALFWorld tasks."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.1 states 'temperature is set to 0' and 'max of 10 rounds of interaction' for InterCode. Section 4.2 states 'maximum of 50 rounds' for ALFWorld."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The entire paper describes the StateFlow scaffolding in detail: state definitions, transition rules, output functions, and Algorithm 1 provides the pseudocode. Figures 1, 3, 4 show the state machine diagrams."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The paper uses standard benchmarks as-is. Section 4.1 describes the InterCode setup (MySQL in Docker) and evaluation protocol. ALFWorld setup is described in Appendix B.1 with BLEU metric mapping."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations section. The conclusion briefly mentions that 'StateFlow requires humans to have a good understanding of a given task' but this is a single sentence, not substantive discussion."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed. The paper does not address potential confounds such as prompt engineering effects, benchmark-specific overfitting, or the manual effort required for state machine design."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what the results do not show or what settings are excluded. It does not bound claims to the tested benchmarks."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental outputs (model responses, execution traces) are released. Only aggregate metrics are reported."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The benchmarks are well-described: InterCode-SQL adapts Spider (1034 tasks), InterCode-Bash has 200 tasks from NL2Bash, ALFWorld has 134 tasks across 6 types. Docker setup is described."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data sources are standard benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The evaluation pipeline is documented: benchmarks → model interaction with environment → reward calculation. BLEU metric mapping for ALFWorld is described (Appendix B.1)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed. Acknowledgements only thank reviewers and a figure designer."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: Pennsylvania State University, MathGPTPro, and Microsoft Research Redmond."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Co-author Chi Wang is from Microsoft Research. The paper uses OpenAI models and the AutoGen framework (from Microsoft). The relationship between Microsoft and the framework being promoted is not discussed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present. Chi Wang is from Microsoft Research, and the implementation is based on AutoGen (a Microsoft project), but this potential conflict is not declared."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No mention of GPT-3.5 or GPT-4's training data cutoff dates."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether the benchmark tasks (Spider SQL, NL2Bash, ALFWorld) appeared in GPT model training data."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Spider was published in 2018, NL2Bash earlier, and ALFWorld in 2020 — all likely in GPT-3.5/GPT-4 training data. This contamination risk is not discussed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "LLM API costs in US dollars are reported for all methods across all benchmarks (Tables 1, 2, 4, 6, 7). Token counts (prompt and completion) are also reported."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Total computational budget (e.g., total API spend for all experiments, hardware used) is not stated. Per-method costs are given but not the total experimental budget."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Temperature is set to 0 (nominally deterministic), yet ALFWorld results are still averaged over 3 attempts, and no variance across those attempts is reported. No seed sensitivity analysis."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "ALFWorld: 'We report average success rate of 3 attempts' (Table 4). InterCode appears to be single runs with temperature 0."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. The state machine designs and prompts appear hand-crafted but no mention of how many configurations were tried."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The state machine designs are presented without explaining how they were selected or iterated upon. The 10-state ALFWorld variant (Appendix B.3) improves over 7-state, suggesting configurations were explored but the process is not documented."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors compare their StateFlow system against their own implementations/adaptations of baselines (ReAct Refined, ALFChat) without acknowledging self-comparison bias."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Cost is reported alongside performance for all methods (Tables 1, 2, 4), and Figure 5 shows cumulative cost vs. success rate for the Reflexion integration."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether InterCode SQL/Bash and ALFWorld actually measure the capabilities claimed. The paper assumes benchmark performance equals 'task-solving' ability."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "StateFlow changes both the scaffolding structure and the prompts simultaneously. The ReAct Refined baseline partially isolates the prompt effect but the scaffold vs. prompt contribution is not disentangled."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The benchmarks (Spider 2018, NL2Bash, ALFWorld 2020) were all released before GPT-3.5/GPT-4 were trained, so their contents may be in the training data. The paper does not discuss whether the models saw benchmark solutions during training."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks information (e.g., the interactive feedback from environments could compensate for leakage effects)."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of potential overlap between training data and benchmark tasks."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods are applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "StateFlow achieves 13% higher success rate than ReAct on InterCode SQL with GPT-3.5, with 5x less cost.",
    365       "evidence": "Table 1: StateFlow 63.73% SR vs ReAct 50.68% SR; cost $3.82 vs $17.7.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "StateFlow achieves 28% higher success rate than ReAct on ALFWorld with GPT-3.5.",
    370       "evidence": "Table 4: StateFlow 83.3% vs ReAct 55.5% overall success rate, with 2.5x less cost ($2.6 vs $6.6).",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "StateFlow combined with Reflexion improves ALFWorld success rate from 84.3% to 94.8% after 6 iterations.",
    375       "evidence": "Figure 5 shows the progression. ReAct+Reflexion reaches only 74.6% at 3x higher cost.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "The Observe state is the most important component in the SQL StateFlow model.",
    380       "evidence": "Table 3 ablation: removing Observe drops SR from 63.73% to 57.83% (largest drop), also increases cost and error rate.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Further decomposition of states (7→10) improves ALFWorld performance to 88.8% with 15% less cost.",
    385       "evidence": "Table 10 in Appendix B.3 compares 7-state (83.3%) vs 10-state (88.8%) StateFlow.",
    386       "supported": "moderate"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Conflict of interest not disclosed",
    392       "detail": "Co-author Chi Wang is from Microsoft Research, and the implementation is built on AutoGen (a Microsoft project). The paper effectively promotes the AutoGen framework without disclosing this as a potential conflict."
    393     },
    394     {
    395       "flag": "No statistical significance testing",
    396       "detail": "All comparative claims ('outperforms', 'improves') are made by comparing point estimates without any significance tests. With only 3 runs on ALFWorld and single runs on InterCode, the observed differences may not be statistically significant."
    397     },
    398     {
    399       "flag": "Benchmark contamination risk",
    400       "detail": "Spider (2018), NL2Bash, and ALFWorld (2020) all predate GPT-3.5/GPT-4 training. The models may have memorized solutions. This is especially concerning because StateFlow's structured workflow could exploit memorized patterns differently than ReAct."
    401     },
    402     {
    403       "flag": "No limitations section",
    404       "detail": "The paper has no dedicated limitations or threats-to-validity section. The manual effort required to design state machines for each task type is acknowledged only briefly in the conclusion."
    405     }
    406   ],
    407   "cited_papers": [
    408     {
    409       "title": "React: Synergizing reasoning and acting in language models",
    410       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik Narasimhan", "Yuan Cao"],
    411       "year": 2022,
    412       "arxiv_id": "2210.03629",
    413       "relevance": "Primary baseline for LLM task-solving with interleaved reasoning and acting."
    414     },
    415     {
    416       "title": "Reflexion: an autonomous agent with dynamic memory and self-reflection",
    417       "authors": ["Noah Shinn", "Beck Labash", "Ashwin Gopinath"],
    418       "year": 2023,
    419       "arxiv_id": "2303.11366",
    420       "relevance": "Iterative self-improvement method combined with StateFlow to demonstrate composability."
    421     },
    422     {
    423       "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversation framework",
    424       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang", "Yiran Wu"],
    425       "year": 2023,
    426       "arxiv_id": "2308.08155",
    427       "relevance": "Multi-agent framework used as the implementation foundation for StateFlow."
    428     },
    429     {
    430       "title": "InterCode: Standardizing and benchmarking interactive coding with execution feedback",
    431       "authors": ["John Yang", "Akshara Prabhakar", "Karthik Narasimhan", "Shunyu Yao"],
    432       "year": 2023,
    433       "arxiv_id": "2306.14898",
    434       "relevance": "Primary benchmark for evaluating interactive code generation with environment feedback."
    435     },
    436     {
    437       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    438       "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao", "Izhak Shafran", "Thomas L Griffiths", "Yuan Cao", "Karthik Narasimhan"],
    439       "year": 2023,
    440       "arxiv_id": "2305.10601",
    441       "relevance": "Alternative LLM reasoning framework modeling thought processes as trees."
    442     },
    443     {
    444       "title": "Graph of thoughts: Solving elaborate problems with large language models",
    445       "authors": ["Maciej Besta"],
    446       "year": 2023,
    447       "arxiv_id": "2308.09687",
    448       "relevance": "Alternative framework modeling LLM reasoning as directed graphs with transformations."
    449     },
    450     {
    451       "title": "MetaGPT: Meta programming for multi-agent collaborative framework",
    452       "authors": ["Sirui Hong"],
    453       "year": 2023,
    454       "arxiv_id": "2308.00352",
    455       "relevance": "Multi-agent framework for software development using LLMs."
    456     },
    457     {
    458       "title": "CAMEL: Communicative agents for 'mind' exploration of large scale language model society",
    459       "authors": ["Guohao Li"],
    460       "year": 2023,
    461       "relevance": "Framework for autonomous LLM agent cooperation."
    462     },
    463     {
    464       "title": "Self-refine: Iterative refinement with self-feedback",
    465       "authors": ["Aman Madaan"],
    466       "year": 2023,
    467       "arxiv_id": "2303.17651",
    468       "relevance": "Self-improvement method for LLMs through iterative refinement."
    469     },
    470     {
    471       "title": "Language agent tree search unifies reasoning acting and planning in language models",
    472       "authors": ["Andy Zhou"],
    473       "year": 2023,
    474       "arxiv_id": "2310.04406",
    475       "relevance": "LLM tree search method incorporating reflection and environment feedback."
    476     },
    477     {
    478       "title": "ALFWorld: Aligning text and embodied environments for interactive learning",
    479       "authors": ["Mohit Shridhar"],
    480       "year": 2020,
    481       "arxiv_id": "2010.03768",
    482       "relevance": "Text-based embodied environment benchmark used for evaluation."
    483     }
    484   ]
    485 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs