ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (24689B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "FlowSteer: Interactive Agentic Workflow Orchestration via End-to-End Reinforcement Learning",
      6     "authors": [
      7       "Mingda Zhang",
      8       "Haoran Luo",
      9       "Tiesunlong Shen",
     10       "Qika Lin",
     11       "Xiaoying Tang"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv",
     15     "arxiv_id": "2602.01664",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims FlowSteer 'significantly outperforms baselines across various tasks' and supports plug-and-play deployment; Tables 3–4 show consistent improvements, and Figure 4 confirms cross-backend transferability.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims about each component's contribution are backed by Table 5's ablation study removing Agent, Multi-turn, Canvas, and RL individually, providing adequate evidence for causal inference.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Appendix K claims applicability to 'law, healthcare, and finance' without any testing in those domains; the Conclusion claims 'broad adaptability' beyond the tested QA, math, and code benchmarks.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss alternative explanations for performance gains, such as increased computation per query (more LLM calls), or that Qwen3-8B may have memorized benchmark answers during pretraining.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper uses task-appropriate metrics (EM/F1 for QA, accuracy for math, Pass@1 for code) and does not conflate proxy measures with broader capabilities; it measures what it claims.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Appendix I provides a dedicated Limitations section discussing error propagation through operators and context window saturation affecting approximately 8% of complex tasks.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The limitations section gives specific quantitative examples: '~8% of complex tasks' hit the 16,384 token context limit, and early-stage operator errors are identified as a specific propagation risk.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper makes broad applicability claims in Appendix K (law, healthcare, finance) without stating that results only apply to the tested benchmark tasks; no explicit scope boundaries are drawn.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment appears anywhere in the paper; the Impact Statement only notes no specific societal consequences require highlighting.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations with CUHK-Shenzhen, Nanyang Technological University, and National University of Singapore are clearly disclosed on the title page.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding source is disclosed, so independence of funder cannot be assessed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or financial disclosure is present in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 3 formally defines 'Workflow Graph' (Definition 1) and 'Orchestration Trajectory' (Definition 2); the canvas environment, operator library, and action space are precisely specified.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper clearly states its contribution: an end-to-end RL framework (FlowSteer) with a novel training algorithm (CWRPO) for automated workflow orchestration via multi-turn agent-canvas interaction.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 and Figure 2 situate FlowSteer relative to three prior paradigms (static selection, offline generation, automated optimization), directly comparing against AFlow, GRPO, and agent-RL methods.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The abstract links directly to a public GitHub repository (https://github.com/beita6969/FlowSteer) rather than stating that code is 'available upon request.'",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All 12 evaluation benchmarks are standard publicly available datasets (GSM8K, MATH, HotPotQA, SQuAD v2, MBPP, HumanEval, TriviaQA, NQ, MathQA, AIME 2025, APPS, DS-1000).",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Table 11 specifies hardware (A100 80GB × 2, CUDA 12.5) and precision (bfloat16) but no requirements.txt, Dockerfile, or Python environment specification is provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The paper provides Algorithm 1 and Table 11 hyperparameters but no step-by-step reproduction instructions; the reader must infer setup from scattered appendix details.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "Tables 3 and 4 report ± standard deviation for all methods across all benchmarks (e.g., '91.41±0.4'), based on three independent runs.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests (t-tests, permutation tests, etc.) are reported despite numerous comparative claims across baselines.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Tables 3 and 4 explicitly report delta improvements (e.g., '+3.12', '+20.31', '+14.84') relative to the best baseline, providing effect sizes with baseline context.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Appendix D states 128 test samples per dataset (30 for AIME 2025) but provides no power analysis or justification for why these sample sizes are sufficient.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "All results in Tables 3, 4, 5, and 6 include ±standard deviation values computed over three independent runs.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Eight distinct baselines are included across four categories: direct LLM (Qwen3-8B, GPT-4o-mini), fine-tuning (SFT, GRPO), search-based (AFlow), and agent-RL (AgentFlow, Router-R1, Orchestrator).",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines include Router-R1 (2025), Orchestrator (2025), DAPO (2025), and AFlow (2024), which are contemporary and competitive methods in the workflow/agent-RL space.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Table 5 presents a full ablation across all 12 benchmarks removing: the agent (w/o Agent), multi-turn interaction (w/o Multi-turn), canvas feedback (w/o Canvas), and RL training (w/o RL).",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "The evaluation uses four task-appropriate metrics: EM and F1 for QA, Accuracy for math reasoning, and Pass@1 for code generation.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "The paper evaluates on automated benchmarks with programmatic ground-truth checking; human evaluation is not relevant for this type of system.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "128 held-out test samples per IID dataset are used for evaluation, and 6 separate OOD benchmarks are kept entirely out of training to assess generalization.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by individual benchmark and grouped by task category (math, QA, code) in Tables 3–5 and Figure 4(b).",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Appendix I discusses error propagation from early operators and context window saturation (~8% of complex tasks); Case Study 3 illustrates iterative failure and repair in code generation.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": false,
    230           "justification": "No experiments where FlowSteer failed to improve or performed worse are reported; all comparisons show FlowSteer as the best-performing method.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Transferability experiments use informal names like 'GPT-5.2', 'Grok-4.1-Fast', 'Claude-Opus-4.5' without API version snapshots or release dates; even GPT-4o-mini lacks a snapshot date.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Figure 6 and Table 2 reproduce the complete system prompt template used by Flow-Director, including all instructions and action format requirements.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Table 11 provides comprehensive hyperparameters covering model config (LoRA rank/alpha, dropout), training (LR=1e-5, batch=36, steps=300), CWRPO (clip=0.20, KL=0.005), and generation (temperature=0.6, top-p=0.95).",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The workflow canvas, 12-operator library, 8-action-type space, state machine (BUILDING/AWAITING_PROMPT), and multi-turn interaction loop are described in detail in Sections 4.1–4.2 and Appendix A.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Appendix D documents the training data construction: specific sample counts per dataset (e.g., 2,560 from GSM8K, 164 from HumanEval), yielding 10,778 total training instances.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The specific 128-sample test splits used for evaluation are not released; only the source benchmark datasets are public, but the paper's specific subsets cannot be independently verified.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Appendix D describes how test samples were collected: '128 instances were randomly sampled from each of the six OOD and six IID datasets for testing, except AIME 2025 with 30 problems.'",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "Standard public benchmarks are used; no participant recruitment applies.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Appendix D documents the full pipeline from benchmark selection to training mix construction, with specific per-source counts and the evaluation sampling strategy.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "The training data cutoff is stated for neither Qwen3-8B (policy model) nor GPT-4o-mini (backend), despite both models having been trained on data that likely includes the evaluation benchmarks.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "The paper does not discuss whether Qwen3-8B's pretraining data includes GSM8K, MATH, HotPotQA, or other evaluation benchmarks, a significant omission for RL fine-tuning on these tasks.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "Most of the 12 benchmarks are long-standing public datasets that likely predate the models' (unstated) training cutoffs, yet potential memorization of benchmark answers by the base model is never discussed.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Figure 5(a) shows token consumption comparison across task types for FlowSteer vs. ablation variants, demonstrating that FlowSteer achieves lower token usage.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Appendix G states 'The total training time for 300 steps is approximately 8 hours' on 'two NVIDIA A100 80GB GPUs with CUDA 12.5.'",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "FlowSteer significantly outperforms all baseline categories on all six IID benchmarks (avg +13.28pp Acc/Pass, +18.10pp F1, +22.65pp EM).",
    375       "evidence": "Table 3 shows consistent improvements across GSM8K, MATH, HotPotQA, SQuAD v2, MBPP, HumanEval with ± values from 3 runs.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "FlowSteer generalizes to OOD benchmarks without task-specific fine-tuning, outperforming all baselines on six OOD datasets.",
    380       "evidence": "Table 4 shows improvements on TriviaQA, NaturalQuestions, MathQA, AIME 2025, APPS, DS-1000 with consistent gains.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "CWRPO outperforms GRPO and DAPO on all six IID benchmarks under identical training settings.",
    385       "evidence": "Table 6 shows CWRPO achieving 96.09/81.25/78.12/83.67/84.38/92.96 vs. DAPO at 93.75/74.22/73.44/82.42/81.25/89.06.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Every component (agent, multi-turn interaction, canvas, RL) contributes significantly to performance.",
    390       "evidence": "Table 5 ablation shows removing any component degrades performance across all 12 benchmarks, with RL removal most damaging for complex tasks.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Flow-Director transfers across six different LLM backends with consistent improvements.",
    395       "evidence": "Figure 4 radar charts show improvements on all six backends (DeepSeek-V3.2, Grok-4.1-Fast, GPT-5.2, Claude-Opus-4.5, Gemini-3-Flash, Qwen-Plus).",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "FlowSteer achieves lower token consumption and fewer interaction turns than ablation variants.",
    400       "evidence": "Figure 5(a-b) shows FlowSteer uses fewer tokens and turns across all task types compared to variants missing any component.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "theoretical"
    407   ],
    408   "key_findings": "FlowSteer proposes an end-to-end RL framework where a lightweight Qwen3-8B policy model learns to orchestrate executable workflow graphs by interacting with a canvas environment, outperforming baselines by 13–22pp across 12 benchmarks in math, QA, and code generation. The CWRPO algorithm uses diversity-constrained rewards with conditional release to prevent shortcut behaviors, demonstrating that structural diversity must be learned before answer quality can be optimized. Ablation confirms all four components—agent, multi-turn interaction, canvas feedback, and RL—are individually necessary. The Flow-Director transfers across six different LLM backends without retraining, suggesting the learned orchestration policy is largely backend-agnostic.",
    409   "red_flags": [
    410     {
    411       "flag": "Fabricated/future model names in transferability experiments",
    412       "detail": "Figure 4 and Section 5.4 reference backends including 'GPT-5.2', 'Grok-4.1-Fast', 'Claude-Opus-4.5', 'Gemini-3-Flash' — names that appear speculative or informal, with no API snapshot dates provided, making these results unverifiable."
    413     },
    414     {
    415       "flag": "No statistical significance testing",
    416       "detail": "Despite 50+ comparative claims across 12 benchmarks, no significance tests are reported. The ±std values are shown but never used to assess whether improvements are statistically reliable."
    417     },
    418     {
    419       "flag": "Benchmark contamination unaddressed",
    420       "detail": "Qwen3-8B and GPT-4o-mini are evaluated on benchmarks (GSM8K, MATH, HotPotQA, HumanEval) that almost certainly appeared in their pretraining data; this is not discussed at all."
    421     },
    422     {
    423       "flag": "Untested domain generalization claims",
    424       "detail": "Appendix K claims applicability to law, healthcare, and finance, but no experiments are conducted in these domains; this is pure speculation."
    425     },
    426     {
    427       "flag": "Small test sample size",
    428       "detail": "128 samples per dataset (30 for AIME) is small given benchmark variance; results may not be stable across different random seeds, with no power analysis provided."
    429     },
    430     {
    431       "flag": "Circular theoretical proofs",
    432       "detail": "Appendix B provides formal proofs for all three propositions, but they rely on strong assumptions (Informative Canvas Feedback, Repairability) that essentially assume the conclusions and are not empirically validated."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "AFlow: Automating Agentic Workflow Generation",
    438       "relevance": "Direct baseline and prior work on search-based workflow orchestration using MCTS"
    439     },
    440     {
    441       "title": "DeepSeek-Math: Pushing the Limits of Mathematical Reasoning via GRPO",
    442       "relevance": "Source of the GRPO algorithm that CWRPO builds upon"
    443     },
    444     {
    445       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    446       "relevance": "Key prior work on RL for LLM reasoning that motivates the approach"
    447     },
    448     {
    449       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    450       "relevance": "Foundational agent paradigm that FlowSteer's Flow-Director is based on"
    451     },
    452     {
    453       "title": "Language Agent Tree Search Unifies Reasoning Acting and Planning (LATS)",
    454       "relevance": "Related work on combining search and execution feedback for workflow optimization"
    455     },
    456     {
    457       "title": "Language Agents as Optimizable Graphs (GPTSwarm)",
    458       "relevance": "Prior work on graph-structured workflow optimization"
    459     },
    460     {
    461       "title": "MetaGPT: Meta Programming for a Multi-Agent Collaborative Framework",
    462       "relevance": "Representative multi-agent workflow system that FlowSteer compares against"
    463     },
    464     {
    465       "title": "DAPO: An Open-Source LLM Reinforcement Learning System at Scale",
    466       "relevance": "Direct baseline RL algorithm compared against CWRPO in Table 6"
    467     },
    468     {
    469       "title": "Agent Workflow Memory",
    470       "relevance": "Related work on workflow memory and reuse for agentic systems"
    471     },
    472     {
    473       "title": "ArCHer: Training Language Model Agents via Hierarchical Multi-turn RL",
    474       "relevance": "Related hierarchical multi-turn RL approach for agents"
    475     }
    476   ],
    477   "engagement_factors": {
    478     "practical_relevance": {
    479       "score": 2,
    480       "justification": "The framework addresses a real bottleneck (manual workflow construction) and provides code + demo, but requires non-trivial RL infrastructure to replicate."
    481     },
    482     "surprise_contrarian": {
    483       "score": 1,
    484       "justification": "Applying RL to learn workflow orchestration is a natural extension of current trends; no counterintuitive findings are presented."
    485     },
    486     "fear_safety": {
    487       "score": 0,
    488       "justification": "No AI risk concerns are raised; the Impact Statement explicitly states no societal consequences need highlighting."
    489     },
    490     "drama_conflict": {
    491       "score": 0,
    492       "justification": "Standard system paper with no controversy or adversarial framing."
    493     },
    494     "demo_ability": {
    495       "score": 2,
    496       "justification": "A demo link and GitHub repo are provided in the abstract, enabling hands-on exploration of the system."
    497     },
    498     "brand_recognition": {
    499       "score": 1,
    500       "justification": "Authors are from CUHK-Shenzhen, NTU, and NUS — reputable institutions but not major AI labs like DeepMind, OpenAI, or Meta."
    501     }
    502   },
    503   "hn_data": {
    504     "threads": [],
    505     "top_points": 0,
    506     "total_points": 0,
    507     "total_comments": 0
    508   }
    509 }

Impressum · Datenschutz