scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (31143B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "FlowSteer: Interactive Agentic Workflow Orchestration via End-to-End Reinforcement Learning",
      6     "authors": [
      7       "Mingda Zhang",
      8       "Haoran Luo",
      9       "Tiesunlong Shen",
     10       "Qika Lin",
     11       "Xiaoying Tang"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv",
     15     "arxiv_id": "2602.01664",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims: 'significantly outperforms baselines across various tasks' — supported by Tables 3-4; 'plug-and-play framework' — supported by Figure 4 backend transferability; 'reduces token consumption' — supported by Figure 5(a-b).",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims about component contributions ('RL most affected by complex reasoning tasks') are supported by a controlled ablation study (Table 5) with single-variable removal. The ablation design is adequate for these claims.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Section 1 claims FlowSteer has 'broad adaptability and strong practical potential.' Appendix K claims applicability in 'law, healthcare, and finance' with no evidence from these domains. The paper tests only QA, math, and code generation. The title 'Interactive Agentic Workflow Orchestration' implies generality beyond the tested tasks.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No discussion of alternative explanations for results. The paper doesn't consider whether improvements come from additional compute overhead, whether baseline implementations are suboptimal, or whether the operator library design (rather than RL) drives most gains.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper measures standard benchmark metrics (EM, F1, Accuracy, Pass@1) and claims improved task performance on those specific metrics. The claims match the measurement granularity — no broader framing gap exists.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section I 'Limitations' provides substantive discussion of two key issues: error propagation from early-stage decisions and context window saturation affecting approximately 8% of complex tasks.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section I identifies specific threats: 'Even subtle errors introduced at early stages... may accumulate rapidly via error propagation through subsequent operators' and 'the context window becomes saturated (with our 16,384 token limit affecting approximately 8% of complex tasks).'",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what the results do NOT show. No explicit boundaries on what tasks, domains, or settings are excluded. Section I discusses failure modes but not scope limitations. Appendix K even extends claims to untested domains (law, healthcare, finance).",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgments, grants, or sponsorship information found anywhere in the paper.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly stated: CUHK Shenzhen, Nanyang Technological University, National University of Singapore. Authors are academic researchers, not evaluating products from their own companies.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No funding source is disclosed, making it impossible to assess funder independence. Absence of disclosure is not evidence of absence of conflict.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial disclosure statement is present in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms including 'Workflow Graph' (Definition 1), 'Orchestration Trajectory' (Definition 2), and all operators in the library are formally defined.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The contribution is explicitly stated: an end-to-end RL framework (FlowSteer) with a novel optimization algorithm (CWRPO) for automated workflow orchestration via multi-turn canvas interaction.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 and the introduction compare FlowSteer against three named paradigms (static selection, offline generation, automated optimization) with specific references and limitations of each.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The abstract states 'Our code is available at https://github.com/beita6969/FlowSteer' and the paper header includes links for Homepage, Demo, GitHub, Model, and Dataset.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All 12 evaluation benchmarks (GSM8K, MATH, HotPotQA, SQuAD v2, MBPP, HumanEval, TriviaQA, NaturalQuestions, MathQA, AIME 2025, APPS, DS-1000) are publicly available standard datasets. The paper header also includes a 'Dataset' link.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Table 11 lists hardware (NVIDIA A100 80GB × 2, CUDA 12.5) and framework details (vLLM, bfloat16), but no requirements.txt, Dockerfile, or library version listing is provided in the paper. The information is insufficient to recreate the full software environment.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The paper provides detailed hyperparameters (Table 11) and algorithmic pseudocode (Algorithm 1), but no step-by-step reproduction instructions, README commands, or scripts to replicate experiments are described in the paper itself.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "Tables 3 and 4 report ± values for all baselines and methods (e.g., '91.41±0.4', '92.97±0.6'), indicating variance across three independent runs.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper claims FlowSteer 'significantly outperforms baselines' but reports no statistical significance tests (no p-values, t-tests, or any formal test). The claim of significance is informal, based solely on comparing point estimates.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Tables 3 and 4 include ∆↑ columns showing absolute improvements over the best baseline (e.g., +20.31 on MATH, +30.46 on SQuAD v2 EM). The baseline values are visible in the same tables, providing full context.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Appendix D states '128 instances were randomly sampled from each of the six out-of-distribution and six trained datasets for testing, except for AIME 2025 which contains 30 problems.' No justification for why 128 was chosen, no power analysis.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Appendix E states 'we conducted three independent runs under identical settings for both Flow-Steer and all baselines and reported the averaged results.' Tables 3 and 4 show ± values representing standard deviation across runs.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Extensive baselines across four categories: direct LLM (Qwen3-8B, GPT-4o-mini), fine-tuning (SFT, GRPO), search-based (AFlow), and agent+RL (AgentFlow, Router-R1, Orchestrator). See Tables 3-4 and Appendix E.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines include Router-R1 (Zhang et al., 2025), Orchestrator (Su et al., 2025), and AFlow (Zhang et al., 2024), all recent work. Table 9 provides architectural comparison.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Table 5 presents a comprehensive ablation study across all 12 datasets: w/o Agent, w/o Multi-turn, w/o Canvas, w/o RL. Each component removal is tested independently.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Multiple task-appropriate metrics: EM and F1 for QA tasks, Accuracy for math reasoning, Pass@1 for code generation. Evaluation metrics are detailed in Appendix F.",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "No human evaluation is included. All evaluation is automated using standard benchmark metrics. No human assessment of workflow quality, answer quality, or system usability.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Training uses 10,778 instances from IID datasets (Appendix D), while testing uses separately sampled 128 instances per dataset. Additionally, 6 OOD benchmarks (not used in training) provide held-out evaluation.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Tables 3 and 4 provide per-dataset results across all 12 benchmarks. Figure 4(b) shows aggregated performance grouped by task type (math, QA, code). Ablation results (Table 5) also show per-dataset breakdowns.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "Section I (Limitations) discusses general failure modes (error propagation, context saturation) but shows no specific failure examples. The four case studies in Appendix H all show successful workflows. No error analysis of where FlowSteer produces incorrect answers.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": false,
    230           "justification": "Every experiment shows FlowSteer outperforming baselines. The ablation study shows component removal hurts performance (expected), but the paper reports no approaches that were tried and abandoned and no configurations that failed.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "The paper uses 'GPT-4o-mini' and 'Qwen3-8B' without specific version numbers or snapshot dates. GPT-4o-mini is a marketing name; the criterion requires specific API versions. Similarly, 'GPT-OSS-120B', 'GPT-5.2', 'Grok-4.1-Fast' etc. lack precise versioning.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Figure 6 provides the complete system prompt template for Flow-Director. Table 2 shows the prompt template structure. The case studies in Appendix H show actual prompts used during interactions.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Table 11 provides comprehensive hyperparameters: temperature (0.6), top-p (0.95), top-k (20), learning rate (1e-5), LoRA rank (64), clip range (0.20), KL coefficient (0.005), batch size (36), max interaction rounds (20), etc.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The Workflow Canvas environment is described in detail: operator library (Table 1, Table 7), action space (Table 8), state machine (Appendix A.3.1), multi-turn interaction process (Section 4.2), feedback mechanisms (Eq. 7), and complete interaction examples (Figure 7, Appendix H).",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Appendix D documents training set construction: 2560 from GSM8K, 2560 from MATH, 2560 from HotPotQA, 2560 from SQuAD v2, 374 from MBPP (full), 164 from HumanEval (full), totaling 10,778 instances. Test set: 128 randomly sampled per dataset.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "While code and benchmarks are available, raw experimental data (model outputs, generated trajectories, per-example results) are not mentioned as being released. Only aggregated results in tables are provided.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Appendix D describes all 12 datasets with their sources, characteristics, and how training/test splits were constructed. Training set composition is documented with exact counts per dataset.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. All data comes from standard public benchmarks.",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The training pipeline is documented: data mixing strategy (Appendix D), training procedure with three stages (Algorithm 1), evaluation protocol with specific sample counts (128 per dataset, 3 runs). Appendix G provides additional implementation details.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "The paper uses GPT-4o-mini and Qwen3-8B as primary models but does not state the training data cutoff date for either model.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of whether benchmark examples (HumanEval published 2021, GSM8K published 2021, etc.) appeared in the training data of GPT-4o-mini or Qwen3-8B.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "Multiple benchmarks (HumanEval, GSM8K, MATH, HotPotQA, SQuAD v2, MBPP) were published years before the models' training cutoffs. No contamination analysis or discussion is provided.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study. All evaluation is automated on standard benchmarks.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Figure 5(a) shows token consumption comparison across task types (math, QA, code). Figure 5(b) shows average interaction turns. Section 5.5 discusses that FlowSteer achieves lower turns and fewer tokens across all task types.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Table 11 states GPU hardware (NVIDIA A100 80GB × 2) and Appendix G states 'The total training time for 300 steps is approximately 8 hours.' CUDA version (12.5) and precision (bfloat16) are also specified.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": true,
    374           "justification": "Appendix E states 'we conducted three independent runs under identical settings for both Flow-Steer and all baselines and reported the averaged results.' Tables 3-4 show ± values representing variation across runs.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": true,
    380           "justification": "Appendix E explicitly states 'three independent runs' for all methods.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "Table 11 lists detailed hyperparameter values but provides no information about how they were selected — no search budget, no number of configurations tried, no search method described.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": false,
    392           "justification": "No explanation of how the final hyperparameter configuration (e.g., LoRA rank 64, learning rate 1e-5, clip range 0.20) was selected. Only the final values are reported.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "The paper makes many comparisons (8 methods × 12 datasets × multiple metrics) without any correction for multiple comparisons (no Bonferroni, Holm, or similar corrections).",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The authors implement their own versions of baselines (AgentFlow, Router-R1, Orchestrator per Table 10). No acknowledgment of author evaluation bias per Lucic et al. (2018), no independent evaluation.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "Figure 5(a) shows token consumption but does not plot performance as a function of compute budget. No analysis of whether FlowSteer's gains justify its computational overhead compared to simpler approaches.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "The paper uses standard benchmarks without any discussion of whether they actually measure the claimed capabilities. No analysis of construct validity for any benchmark.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": true,
    421           "answer": true,
    422           "justification": "All workflow methods use the same backend LLM (GPT-4o-mini) for execution, controlling for model capability. The different orchestration approaches (scaffold) are the experimental variable being compared. Table 3 header confirms 'Agent+RL (4o-mini)' and 'Ours (4o-mini)'.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "No discussion of temporal leakage. Many benchmarks (HumanEval 2021, GSM8K 2021, MATH 2021) were published years before the models were trained, meaning solutions may exist in training data.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "No discussion of whether the evaluation setup leaks information. The workflow canvas provides execution feedback during orchestration, but there is no analysis of whether this creates an unfair advantage versus baselines.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "No discussion of whether training and test examples from the same benchmarks share structural similarities or come from overlapping distributions.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No concrete leakage detection or prevention method is used — no canary strings, membership inference tests, n-gram overlap analysis, or decontamination pipelines.",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "FlowSteer significantly outperforms all baselines on 12 benchmarks across math, QA, and code generation",
    457       "evidence": "Tables 3 and 4 show FlowSteer achieving highest scores on all reported metrics across 6 IID and 6 OOD benchmarks",
    458       "supported": "moderate"
    459     },
    460     {
    461       "claim": "CWRPO outperforms GRPO and DAPO under identical training settings",
    462       "evidence": "Table 6 shows CWRPO achieving the highest accuracy on all 6 IID benchmarks compared to GRPO and DAPO",
    463       "supported": "moderate"
    464     },
    465     {
    466       "claim": "FlowSteer generalizes to OOD benchmarks without task-specific fine-tuning of the backend LLM",
    467       "evidence": "Table 4 shows consistent improvements on TriviaQA, NQ, MathQA, AIME 2025, APPS, DS-1000 not seen during training",
    468       "supported": "moderate"
    469     },
    470     {
    471       "claim": "FlowSteer reduces token consumption compared to ablation variants while maintaining higher accuracy",
    472       "evidence": "Figure 5(a) shows lower token usage for FlowSteer Full vs. w/o Agent, w/o Multi-turn, w/o Canvas, w/o RL across all task types",
    473       "supported": "moderate"
    474     },
    475     {
    476       "claim": "Diversity-constrained rewards with conditional release suppress shortcut behaviors",
    477       "evidence": "Appendix B.3 provides a theoretical proof; Table 6 shows CWRPO outperforming GRPO/DAPO which lack diversity constraints",
    478       "supported": "weak"
    479     },
    480     {
    481       "claim": "Flow-Director trained on one backend transfers effectively to diverse LLM backends",
    482       "evidence": "Figure 4 shows performance improvements across 6 backends, but model names are inconsistent between Figure 4 and Appendix E5",
    483       "supported": "weak"
    484     }
    485   ],
    486   "methodology_tags": [
    487     "benchmark-eval",
    488     "theoretical"
    489   ],
    490   "key_findings": "FlowSteer proposes an end-to-end RL framework where a lightweight policy model (Qwen3-8B, Flow-Director) learns to orchestrate multi-step workflows through iterative interaction with an executable canvas environment. The CWRPO algorithm introduces diversity-constrained rewards with conditional release, requiring workflows to achieve structural quality criteria before answer rewards are unlocked, suppressing shortcut behaviors. Evaluation on 12 benchmarks shows consistent improvements over baselines in math reasoning, QA, and code generation, with the trained Flow-Director transferring to OOD settings and different LLM backends without task-specific retraining.",
    491   "red_flags": [
    492     {
    493       "flag": "Model name inconsistencies",
    494       "detail": "Figure 4 caption lists 'GPT-5.2', 'Grok-4.1-Fast', 'Gemini-3-Flash' while Appendix E5 references 'GPT-5', 'Grok-4-Fast', 'Gemini-2.5-Flash' for the same backends — suggesting fabricated or misidentified model names in the transferability experiments."
    495     },
    496     {
    497       "flag": "No significance tests",
    498       "detail": "Despite comparative performance claims across 8 baselines on 12 datasets, no statistical significance tests are reported; standard deviations are provided but not tested."
    499     },
    500     {
    501       "flag": "Tiny test sets without justification",
    502       "detail": "Only 128 test samples per benchmark (30 for AIME 2025) are used with no power analysis or justification; results on 128 samples are unreliable for detecting modest differences."
    503     },
    504     {
    505       "flag": "No contamination analysis",
    506       "detail": "Qwen3-8B is evaluated on GSM8K, HumanEval, and MATH — benchmarks commonly included in LLM pretraining corpora — with no discussion of potential contamination."
    507     },
    508     {
    509       "flag": "Suspiciously large improvements on some benchmarks",
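    510       "detail": "The reported SQuAD v2 EM improvement of +30.46pp over the best IID baseline is implausibly large for a workflow orchestration framework, and no explanation is provided. Note that the values cited alongside it (73.44% → 78.12%) differ by only 4.68pp and do not match the reported +30.46pp delta; the figures should be re-checked against the paper's results tables."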
    511     },
    512     {
    513       "flag": "Unfounded domain applicability claims",
    514       "detail": "Appendix K asserts applicability to law, healthcare, and finance with no supporting experiments, constituting overclaiming beyond the scope of the evidence."
    515     }
    516   ],
    517   "cited_papers": [
    518     {
    519       "title": "AFlow: Automating Agentic Workflow Generation",
    520       "relevance": "Key baseline using MCTS for workflow optimization; directly compared against FlowSteer as a search-based method"
    521     },
    522     {
    523       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    524       "relevance": "Foundation work on RL-based LLM reasoning that motivates the CWRPO design"
    525     },
    526     {
    527       "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning (GRPO paper)",
    528       "relevance": "Introduces GRPO algorithm that CWRPO directly extends and compares against"
    529     },
    530     {
    531       "title": "MetaGPT: Meta Programming for a Multi-Agent Collaborative Framework",
    532       "relevance": "Key prior work on multi-agent workflow systems with SOP-based orchestration"
    533     },
    534     {
    535       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    536       "relevance": "Foundation for the ReAct-based agent paradigm used by FlowSteer's Flow-Director"
    537     },
    538     {
    539       "title": "Language Agents as Optimizable Graphs (GPTSwarm)",
    540       "relevance": "Related work on treating agent workflows as optimizable graph structures"
    541     },
    542     {
    543       "title": "Language Agent Tree Search Unifies Reasoning Acting and Planning (LATS)",
    544       "relevance": "Competing automated workflow optimization baseline combining search and execution feedback"
    545     },
    546     {
    547       "title": "DAPO: An Open-Source LLM Reinforcement Learning System at Scale",
    548       "relevance": "RL algorithm directly compared to CWRPO in the RL algorithm comparison study"
    549     },
    550     {
    551       "title": "Router-R1: Teaching LLMs Multi-Round Routing and Aggregation via Reinforcement Learning",
    552       "relevance": "Competing agent+RL baseline directly compared in main experiments"
    553     },
    554     {
    555       "title": "Agent Workflow Memory",
    556       "relevance": "Related work on reusable workflow memory for improving agentic efficiency"
    557     }
    558   ],
    559   "engagement_factors": {
    560     "practical_relevance": {
    561       "score": 2,
    562       "justification": "Automated workflow orchestration via RL addresses real pipeline-building pain points and code is publicly released."
    563     },
    564     "surprise_contrarian": {
    565       "score": 1,
    566       "justification": "Using a small policy model to direct larger backend LLMs is interesting but follows an expected research trajectory in RL-for-agents."
    567     },
    568     "fear_safety": {
    569       "score": 0,
    570       "justification": "No safety concerns or risks are discussed; the framework focuses purely on task performance optimization."
    571     },
    572     "drama_conflict": {
    573       "score": 0,
    574       "justification": "No controversial claims or conflict with established consensus."
    575     },
    576     "demo_ability": {
    577       "score": 2,
    578       "justification": "Code is released on GitHub and a demo link is provided in the abstract; the system can be run given appropriate API access."
    579     },
    580     "brand_recognition": {
    581       "score": 1,
    582       "justification": "Authors are from NTU, NUS, and CUHK Shenzhen — recognized institutions but not top AI labs (Google, OpenAI, Meta)."
    583     }
    584   },
    585   "hn_data": {
    586     "threads": [],
    587     "top_points": 0,
    588     "total_points": 0,
    589     "total_comments": 0
    590   }
    591 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs