scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29308B)
      1 {
      2   "paper": {
      3     "title": "FlowSteer: Interactive Agentic Workflow Orchestration via End-to-End Reinforcement Learning",
      4     "authors": [
      5       "Mingda Zhang",
      6       "Haoran Luo",
      7       "Tiesunlong Shen",
      8       "Qika Lin",
      9       "Xiaoying Tang",
     10       "Rui Mao",
     11       "Erik Cambria"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv",
     15     "arxiv_id": "2602.01664"
     16   },
     17   "scan_version": 2,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "FlowSteer proposes an end-to-end RL framework (CWRPO) that trains a lightweight policy model (Qwen3-8B) to orchestrate workflows on an executable canvas, using GPT-4o-mini as the backend. On 12 benchmarks across math, QA, and code generation, FlowSteer outperforms direct LLM baselines, SFT/GRPO, search-based (AFlow), and agent+RL methods, with the largest gains on MATH (+20.31), SQuAD v2 EM (+30.46), and MBPP (+19.54). The framework transfers across six LLM backends and generalizes to OOD benchmarks without task-specific fine-tuning.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The abstract states 'Our code is available at https://github.com/beita6969/FlowSteer' and the paper header includes links for Homepage, Demo, GitHub, Model, and Dataset."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "All 12 evaluation benchmarks (GSM8K, MATH, HotPotQA, SQuAD v2, MBPP, HumanEval, TriviaQA, NaturalQuestions, MathQA, AIME 2025, APPS, DS-1000) are publicly available standard datasets. The paper header also includes a 'Dataset' link."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Table 11 lists hardware (NVIDIA A100 80GB × 2, CUDA 12.5) and framework details (vLLM, bfloat16), but no requirements.txt, Dockerfile, or library version listing is provided in the paper. The information is insufficient to recreate the full software environment."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper provides detailed hyperparameters (Table 11) and algorithmic pseudocode (Algorithm 1), but no step-by-step reproduction instructions, README commands, or scripts to replicate experiments are described in the paper itself."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Tables 3 and 4 report ± values for all baselines and methods (e.g., '91.41±0.4', '92.97±0.6'), indicating variance across three independent runs."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims FlowSteer 'significantly outperforms baselines' but reports no statistical significance tests (no p-values, t-tests, or any formal test). The claim of significance is informal, based solely on comparing point estimates."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Tables 3 and 4 include ∆↑ columns showing absolute improvements over the best baseline (e.g., +20.31 on MATH, +30.46 on SQuAD v2 EM). The baseline values are visible in the same tables, providing full context."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "Appendix D states '128 instances were randomly sampled from each of the six out-of-distribution and six trained datasets for testing, except for AIME 2025 which contains 30 problems.' No justification for why 128 was chosen, no power analysis."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Appendix E states 'we conducted three independent runs under identical settings for both Flow-Steer and all baselines and reported the averaged results.' Tables 3 and 4 show ± values representing standard deviation across runs."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Extensive baselines across four categories: direct LLM (Qwen3-8B, GPT-4o-mini), fine-tuning (SFT, GRPO), search-based (AFlow), and agent+RL (AgentFlow, Router-R1, Orchestrator). See Tables 3-4 and Appendix E."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Baselines include Router-R1 (Zhang et al., 2025), Orchestrator (Su et al., 2025), and AFlow (Zhang et al., 2024), all recent work. Table 9 provides architectural comparison."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Table 5 presents a comprehensive ablation study across all 12 datasets: w/o Agent, w/o Multi-turn, w/o Canvas, w/o RL. Each component removal is tested independently."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Multiple task-appropriate metrics: EM and F1 for QA tasks, Accuracy for math reasoning, Pass@1 for code generation. Evaluation metrics are detailed in Appendix F."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No human evaluation is included. All evaluation is automated using standard benchmark metrics. No human assessment of workflow quality, answer quality, or system usability."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Training uses 10,778 instances from IID datasets (Appendix D), while testing uses separately sampled 128 instances per dataset. Additionally, 6 OOD benchmarks (not used in training) provide held-out evaluation."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Tables 3 and 4 provide per-dataset results across all 12 benchmarks. Figure 4(b) shows aggregated performance grouped by task type (math, QA, code). Ablation results (Table 5) also show per-dataset breakdowns."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "Section I (Limitations) discusses general failure modes (error propagation, context saturation) but shows no specific failure examples. The four case studies in Appendix H all show successful workflows. No error analysis of where FlowSteer produces incorrect answers."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "Every experiment shows FlowSteer outperforming baselines. The ablation study shows component removal hurts performance (expected), but no approaches tried and abandoned or configurations that failed are reported."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Abstract claims: 'significantly outperforms baselines across various tasks' — supported by Tables 3-4; 'plug-and-play framework' — supported by Figure 4 backend transferability; 'reduces token consumption' — supported by Figure 5(a-b)."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Causal claims about component contributions (e.g., that RL has the greatest impact on complex reasoning tasks) are supported by a controlled ablation study (Table 5) with single-variable removal. The ablation design is adequate for these claims."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Section 1 claims FlowSteer has 'broad adaptability and strong practical potential.' Appendix K claims applicability in 'law, healthcare, and finance' with no evidence from these domains. The paper tests only QA, math, and code generation. The title 'Interactive Agentic Workflow Orchestration' implies generality beyond the tested tasks."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "No discussion of alternative explanations for results. The paper doesn't consider whether improvements come from additional compute overhead, whether baseline implementations are suboptimal, or whether the operator library design (rather than RL) drives most gains."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper measures standard benchmark metrics (EM, F1, Accuracy, Pass@1) and claims improved task performance on those specific metrics. The claims match the measurement granularity — no broader framing gap exists."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "The paper uses 'GPT-4o-mini' and 'Qwen3-8B' without specific version numbers or snapshot dates. GPT-4o-mini is a marketing name; the criterion requires specific API versions. Similarly, 'GPT-OSS-120B', 'GPT-5.2', 'Grok-4.1-Fast' etc. lack precise versioning."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Figure 6 provides the complete system prompt template for Flow-Director. Table 2 shows the prompt template structure. The case studies in Appendix H show actual prompts used during interactions."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Table 11 provides comprehensive hyperparameters: temperature (0.6), top-p (0.95), top-k (20), learning rate (1e-5), LoRA rank (64), clip range (0.20), KL coefficient (0.005), batch size (36), max interaction rounds (20), etc."
    160       },
    161       "scaffolding_described": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The Workflow Canvas environment is described in detail: operator library (Table 1, Table 7), action space (Table 8), state machine (Appendix A.3.1), multi-turn interaction process (Section 4.2), feedback mechanisms (Eq. 7), and complete interaction examples (Figure 7, Appendix H)."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Appendix D documents training set construction: 2560 from GSM8K, 2560 from MATH, 2560 from HotPotQA, 2560 from SQuAD v2, 374 from MBPP (full), 164 from HumanEval (full), totaling 10,778 instances. Test set: 128 randomly sampled per dataset."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section I 'Limitations' provides substantive discussion of two key issues: error propagation from early-stage decisions and context window saturation affecting approximately 8% of complex tasks."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section I identifies specific threats: 'Even subtle errors introduced at early stages... may accumulate rapidly via error propagation through subsequent operators' and 'the context window becomes saturated (with our 16,384 token limit affecting approximately 8% of complex tasks).'"
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper does not explicitly state what the results do NOT show. No explicit boundaries on what tasks, domains, or settings are excluded. Section I discusses failure modes but not scope limitations. Appendix K even extends claims to untested domains (law, healthcare, finance)."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "While code and benchmarks are available, raw experimental data (model outputs, generated trajectories, per-example results) are not mentioned as being released. Only aggregated results in tables are provided."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Appendix D describes all 12 datasets with their sources, characteristics, and how training/test splits were constructed. Training set composition is documented with exact counts per dataset."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. All data comes from standard public benchmarks."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The training pipeline is documented: data mixing strategy (Appendix D), training procedure with three stages (Algorithm 1), evaluation protocol with specific sample counts (128 per dataset, 3 runs). Appendix G provides additional implementation details."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding acknowledgments, grants, or sponsorship information found anywhere in the paper."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly stated: CUHK Shenzhen, Nanyang Technological University, National University of Singapore. Authors are academic researchers, not evaluating products from their own companies."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No funding source is disclosed, making it impossible to assess funder independence. Absence of disclosure is not evidence of absence of conflict."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial disclosure statement is present in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The paper uses GPT-4o-mini and Qwen3-8B as primary models but does not state the training data cutoff date for either model."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No discussion of whether benchmark examples (HumanEval published 2021, GSM8K published 2021, etc.) appeared in the training data of GPT-4o-mini or Qwen3-8B."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "Multiple benchmarks (HumanEval, GSM8K, MATH, HotPotQA, SQuAD v2, MBPP) were published years before the models' training cutoffs. No contamination analysis or discussion is provided."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study. All evaluation is automated on standard benchmarks."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": true,
    291         "justification": "Figure 5(a) shows token consumption comparison across task types (math, QA, code). Figure 5(b) shows average interaction turns. Section 5.5 discusses that FlowSteer achieves lower turns and fewer tokens across all task types."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Table 11 states GPU hardware (NVIDIA A100 80GB × 2) and Appendix G states 'The total training time for 300 steps is approximately 8 hours.' CUDA version (12.5) and precision (bfloat16) are also specified."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": true,
    303         "justification": "Appendix E states 'we conducted three independent runs under identical settings for both Flow-Steer and all baselines and reported the averaged results.' Tables 3-4 show ± values representing variation across runs."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": true,
    308         "justification": "Appendix E explicitly states 'three independent runs' for all methods."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "Table 11 lists detailed hyperparameter values but provides no information about how they were selected — no search budget, no number of configurations tried, no search method described."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "No explanation of how the final hyperparameter configuration (e.g., LoRA rank 64, learning rate 1e-5, clip range 0.20) was selected. Only the final values are reported."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The paper makes many comparisons (8 methods × 12 datasets × multiple metrics) without any correction for multiple comparisons (no Bonferroni, Holm, or similar corrections)."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors implement their own versions of baselines (AgentFlow, Router-R1, Orchestrator per Table 10). No acknowledgment of author evaluation bias per Lucic et al. (2018), no independent evaluation."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "Figure 5(a) shows token consumption but does not plot performance as a function of compute budget. No analysis of whether FlowSteer's gains justify its computational overhead compared to simpler approaches."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "The paper uses standard benchmarks without any discussion of whether they actually measure the claimed capabilities. No analysis of construct validity for any benchmark."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "All workflow methods use the same backend LLM (GPT-4o-mini) for execution, controlling for model capability. The different orchestration approaches (scaffold) are the experimental variable being compared. Table 3 header confirms 'Agent+RL (4o-mini)' and 'Ours (4o-mini)'."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "No discussion of temporal leakage. Many benchmarks (HumanEval 2021, GSM8K 2021, MATH 2021) were published years before the models were trained, meaning solutions may exist in training data."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of whether the evaluation setup leaks information. The workflow canvas provides execution feedback during orchestration, but there is no analysis of whether this creates an unfair advantage versus baselines."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether training and test examples from the same benchmarks share structural similarities or come from overlapping distributions."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No concrete leakage detection or prevention method is used — no canary strings, membership inference tests, n-gram overlap analysis, or decontamination pipelines."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "FlowSteer significantly outperforms baselines across six IID benchmarks, with largest improvements on math reasoning (MATH +20.31, GSM8K +3.12) and QA (SQuAD v2 EM +30.46, HotPotQA EM +14.84).",
    372       "evidence": "Table 3 shows FlowSteer outperforms all baselines on every IID benchmark. Three independent runs with ± values reported. GPT-4o-mini backend used for all workflow methods.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "FlowSteer maintains superiority on six OOD benchmarks without task-specific fine-tuning, demonstrating cross-task generalization.",
    377       "evidence": "Table 4 shows OOD results. FlowSteer leads on all metrics, with gains like NaturalQuestions EM +14.85 and DS-1000 Pass@1 +13.28. However, AIME 2025 has only 30 problems, on which FlowSteer reaches 26.67% accuracy.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "FlowSteer transfers across six different LLM backends (DeepSeek-V3.2, Grok-4.1-Fast, GPT-5.2, Claude-Opus-4.5, Gemini-3-Flash, Qwen-Plus) with consistent improvements.",
    382       "evidence": "Figure 4(a) radar charts and Figure 4(b) aggregated results show consistent gains across backends. Weaker baselines benefit more. Figure 4(c) shows training dynamics converge for both 4o-mini and OSS-120B backends.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "All four components (agent, multi-turn interaction, canvas feedback, RL) are necessary for FlowSteer's performance.",
    387       "evidence": "Table 5 ablation study across all 12 datasets. Removing any component reduces performance, with RL having the greatest impact on complex tasks (MATH, code generation). Full FlowSteer is best on every dataset.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "CWRPO outperforms GRPO and DAPO on all IID benchmarks with more stable training dynamics.",
    392       "evidence": "Table 6 shows CWRPO leads on all six IID benchmarks. Figure 5(c-e) pairwise comparison matrices show consistent CWRPO superiority across math, QA, and code tasks.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "FlowSteer achieves lower token consumption and fewer interaction turns compared to ablation variants.",
    397       "evidence": "Figure 5(a-b) shows FlowSteer uses fewer tokens and turns across all task types. Removing any component increases both resource usage and interaction count.",
    398       "supported": "moderate"
    399     }
    400   ],
    401   "red_flags": [
    402     {
    403       "flag": "No significance tests despite 'significant' claims",
    404       "detail": "The paper repeatedly claims FlowSteer 'significantly outperforms' baselines but uses no statistical significance tests. All claims of superiority are based on comparing point estimates. With only 128 test samples and 3 runs, the ± values overlap in several cells of Tables 3-4, making it unclear whether differences are statistically significant."
    405     },
    406     {
    407       "flag": "Complete absence of contamination discussion",
    408       "detail": "Multiple benchmarks (HumanEval, GSM8K, MATH, MBPP, HotPotQA, SQuAD v2) were published 3-5 years before the models' likely training cutoffs. The paper makes no attempt to discuss or address benchmark contamination, a critical concern for benchmark-eval papers."
    409     },
    410     {
    411       "flag": "Author-implemented baselines without independent verification",
    412       "detail": "Table 10 shows the authors implemented their own versions of AgentFlow, Router-R1, and Orchestrator. No independent verification or use of official implementations is mentioned. Per Lucic et al. (2018), author implementations of baselines systematically underperform."
    413     },
    414     {
    415       "flag": "Extremely small sample for AIME claims",
    416       "detail": "AIME 2025 contains only 30 problems. FlowSteer achieves 26.67% (≈8 correct) vs baselines at 10-20% (≈3-6 correct). The difference of 2-5 additional correct answers is presented as a meaningful improvement (+16.67pp, i.e. only 5 of 30 problems) but is not statistically reliable at this sample size."
    417     },
    418     {
    419       "flag": "Unbounded generalization claims",
    420       "detail": "Appendix K claims applicability in 'law, healthcare, and finance' with no evidence. Section 1 claims 'broad adaptability and strong practical potential.' These claims go far beyond the three task types tested (math, QA, code generation)."
    421     },
    422     {
    423       "flag": "Every experiment shows FlowSteer winning",
    424       "detail": "Across all 12 datasets, all metrics, all backends, all ablations, and all RL algorithm comparisons, FlowSteer (Full) is the best method in every single comparison. No negative results or situations where FlowSteer underperforms are reported, which strains credulity."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "AFlow: Automating Agentic Workflow Generation",
    430       "authors": ["Jiayi Zhang"],
    431       "year": 2024,
    432       "arxiv_id": "2410.10762",
    433       "relevance": "Directly comparable workflow orchestration method using Monte Carlo Tree Search, serves as baseline."
    434     },
    435     {
    436       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    437       "authors": ["DeepSeek-AI"],
    438       "year": 2025,
    439       "arxiv_id": "2501.12948",
    440       "relevance": "Foundational work on GRPO-style RL objectives for LLM reasoning, directly motivates CWRPO design."
    441     },
    442     {
    443       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    444       "authors": ["Shunyu Yao"],
    445       "year": 2023,
    446       "relevance": "Foundational agent paradigm (reason+act) that FlowSteer's Flow-Director builds upon."
    447     },
    448     {
    449       "title": "Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models",
    450       "authors": ["Andy Zhou"],
    451       "year": 2024,
    452       "relevance": "Agent RL method combining search and planning, representative of the automated workflow optimization paradigm."
    453     },
    454     {
    455       "title": "MetaGPT: Meta Programming for a Multi-Agent Collaborative Framework",
    456       "authors": ["Sirui Hong"],
    457       "year": 2024,
    458       "relevance": "Multi-agent workflow framework using SOPs/roles, representative of the orchestration paradigm FlowSteer aims to automate."
    459     },
    460     {
    461       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    462       "authors": ["Qingyun Wu"],
    463       "year": 2023,
    464       "arxiv_id": "2308.08155",
    465       "relevance": "Multi-agent conversation framework for cross-team orchestration, key prior work in agentic workflow design."
    466     },
    467     {
    468       "title": "RouteLLM: Learning to Route LLMs with Preference Data",
    469       "authors": ["Isaac Ong"],
    470       "year": 2024,
    471       "arxiv_id": "2406.18665",
    472       "relevance": "LLM routing and orchestration approach addressing operator/backend selection, directly relevant to workflow orchestration."
    473     },
    474     {
    475       "title": "Evaluating Large Language Models Trained on Code",
    476       "authors": ["Mark Chen"],
    477       "year": 2021,
    478       "arxiv_id": "2107.03374",
    479       "relevance": "Introduces HumanEval benchmark used as primary code generation evaluation in this paper."
    480     },
    481     {
    482       "title": "ReTool: Reinforcement Learning for Strategic Tool Use in LLMs",
    483       "authors": ["Jiazhan Feng"],
    484       "year": 2025,
    485       "arxiv_id": "2504.11536",
    486       "relevance": "RL-based tool use framework using step-grained shaping and outcome feedback, directly comparable agent RL approach."
    487     },
    488     {
    489       "title": "DAPO: An Open-Source LLM Reinforcement Learning System at Scale",
    490       "authors": ["Qiying Yu"],
    491       "year": 2025,
    492       "arxiv_id": "2503.14476",
    493       "relevance": "Open-source RL algorithm used as baseline comparison (Table 6) for training dynamics and performance."
    494     },
    495     {
    496       "title": "Router-R1: Teaching LLMs Multi-Round Routing and Aggregation via Reinforcement Learning",
    497       "authors": ["Hao Zhang"],
    498       "year": 2025,
    499       "arxiv_id": "2506.09033",
    500       "relevance": "Router-style RL agent for multi-round query routing, serves as direct baseline comparison."
    501     },
    502     {
    503       "title": "ChatDev: Communicative Agents for Software Development",
    504       "authors": ["Chen Qian"],
    505       "year": 2024,
    506       "relevance": "Multi-agent software development framework demonstrating workflow-based code generation."
    507     }
    508   ]
    509 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs