scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23819B)
      1 {
      2   "paper": {
      3     "title": "AutoFlow: Automated Workflow Generation for Large Language Model Agents",
      4     "authors": [
      5       "Zelong Li",
      6       "Shuyuan Xu",
      7       "Kai Mei",
      8       "Wenyue Hua",
      9       "Balaji Rama",
     10       "Om Raheja",
     11       "Hao Wang",
     12       "He Zhu",
     13       "Yongfeng Zhang"
     14     ],
     15     "year": 2024,
     16     "venue": "arXiv preprint",
     17     "arxiv_id": "2407.12821"
     18   },
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract states: 'The source code of this work is available at https://github.com/agiresearch/AutoFlow.' A concrete GitHub URL is provided."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper uses the publicly available OpenAGI benchmark dataset (Ge et al. 2023, NeurIPS). This is a standard public benchmark that was not modified by the authors."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Section 5.4 mentions the framework is 'implemented by PyTorch' and references the OpenAGI platform and DSPy framework, but no requirements.txt, Dockerfile, library versions, or detailed environment specification is provided in the paper."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are included in the paper. Section 5.4 describes implementation details at a high level but does not provide commands or a README-style guide to replicate the experiments."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Tables 1 and 2 report only point estimates (e.g., 0.3597, 0.6501) for all tasks and methods. No confidence intervals, error bars, or +/- notation is provided."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims AutoFlow has 'over 40% improvement' and 'over 5% improvement' over baselines (Section 5.5) but provides no statistical significance tests to support these comparative claims."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper reports percentage improvements with baseline context: 'Compared to the best baseline, CoRE, AutoFlow has over 40% improvement when using Mixtral as the LLM interpreter, and over 5% improvement when using GPT-4 as the interpreter LLM' (Section 5.5). The raw scores in Tables 1 and 2 provide the baselines (e.g., CoRE average 0.2483 vs AutoFlow 0.3597)."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification is given for the number of tasks (3 task types) or the number of iterations (30). The paper provides no power analysis or discussion of whether this sample size is adequate for the claims made."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No standard deviations, variance, or multiple-run results are reported. Tables 1 and 2 show single-run point estimates only with no indication of result stability across different runs or seeds."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Tables 1 and 2 compare AutoFlow against four baselines: Zero-shot Learning, Chain-of-Thought, Few-shot Learning, and CoRE (manually designed workflow)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The baselines include CoRE (2024), DSPy-based CoT (2023), and standard prompting strategies. These are contemporary methods for LLM agent planning at the time of publication."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The AutoFlow framework has multiple components (workflow generator, interpreter, RL optimization, parser, LoRA), but no ablation study is presented to isolate the contribution of individual components."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Three different metrics are used for three task types: CLIP Score for text-to-image tasks (Task 1), BERT Score for text tasks (Task 2), and ViT Score for image tasks (Task 3), as described in Section 5.3."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The paper claims workflows are 'readable by humans' and 'keeping readability' (Section 1), but no human evaluation study is conducted to validate these claims about human readability or workflow quality."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "Section 4.1 mentions a 'validation dataset' for workflow optimization, but there is no explicit mention of a separate held-out test set. It is unclear whether the reported results are on data used for workflow selection/optimization."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Tables 1 and 2 provide per-task breakdowns (Task 1, 2, 3) with different metrics, plus an average across tasks, rather than just a single aggregate score."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No failure cases or error analysis is provided. Section 5.4 briefly mentions that ReAct and Program-of-Thought could not be used as baselines but does not discuss where AutoFlow fails."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "Every result shown is positive for AutoFlow. The only mention of something not working is the inability to use ReAct and Program-of-Thought as baselines (Section 5.4), which is about baselines, not about AutoFlow configurations that failed."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims the framework 'can produce robust and reliable agent workflows.' Tables 1 and 2 show AutoFlow outperforming baselines on the OpenAGI benchmark, which supports the effectiveness claim within the tested setting."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper makes causal claims such as 'the combination of different systems... might lead to a kind of synergistic effect' (Section 5.5) and generally implies that AutoFlow causes performance improvement. However, no ablation or controlled experiment isolates the causal mechanism, and there is only one benchmark with no variance reporting."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The abstract and title claim general 'Automated Workflow Generation for Large Language Model Agents' but experiments are conducted on only one benchmark (OpenAGI) with only 3 task types. The paper does not bound its claims to this specific benchmark setting. The conclusion states AutoFlow 'can reach better performance and significantly reduce the human labor' without qualification."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "No alternative explanations for the results are discussed. For instance, the paper does not consider whether the improvement comes from the RL optimization, the use of GPT-4 as a parser, the particular format of CoRE workflows, or some combination. The synergy observation in Section 5.5 is noted but not investigated."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 5.1 states: 'we use the GPT-4-1106-preview version' and 'Mixtral-8x7B.' The GPT-4 version is specified with a snapshot identifier."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper shows an example workflow generation query in natural language (Section 4.1, Figure 2) but does not provide the actual full prompt text sent to the models. The task descriptions are paraphrased (e.g., 'Provide a workflow with several steps...') but the complete system prompts, few-shot examples used, and exact query formats are not fully specified."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 5.4 reports: 30 iterations, REINFORCE algorithm, Adam optimizer with learning rate 0.001, LoRA rank 8. These are the key hyperparameters for the framework."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The paper describes the agentic scaffolding in detail: the CoRE language system (Section 3.1) with step types (Process, Decision, Terminal), the interpreter LLM execution procedure (4 steps in Section 3.1.2), the generator-interpreter architecture (Section 4), and the GPT-4 parser for grammar correction (Section 4.1)."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper states it uses the OpenAGI benchmark but does not describe how the data was preprocessed, how it was split into training/validation, or any filtering steps applied to the benchmark tasks."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations or threats-to-validity section. Section 6 ('Conclusions and Future Work') briefly mentions that 'RL may not be the most efficient' and suggests future directions, but this is not a substantive limitations discussion."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No specific threats to validity are discussed. The brief mentions of future work in Section 6 concern potential improvements, not threats to the current study's validity."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what its results do NOT show. The title and claims suggest broad applicability ('Large Language Model Agents') but experiments cover only one benchmark (OpenAGI) with two models. No scope boundaries are articulated."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The raw experimental outputs, intermediate workflows generated during training, and per-instance results are not made available. Only aggregate scores in Tables 1 and 2 are reported."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper uses the OpenAGI benchmark but does not describe how the specific task instances were selected, how many instances per task type, or the data collection procedure for the evaluation."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants are involved; this is a purely computational benchmark evaluation study."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The pipeline from raw benchmark data to final reported scores is not documented. It is unclear how many data instances were used for training vs. validation, how rewards were aggregated, or how the final workflow was selected."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source or acknowledgments section is present in the paper. There is no mention of grants, corporate sponsors, or funding agencies."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: seven authors from Rutgers University and two independent researchers. The evaluated products (GPT-4, Mixtral) are not from Rutgers."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Since funding is not disclosed at all, independence cannot be assessed. The absence of funding disclosure is a concern."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement or financial interest declaration is provided in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper uses GPT-4-1106-preview and Mixtral-8x7B to evaluate on the OpenAGI benchmark but does not state the training data cutoff dates for either model."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "The paper does not discuss whether OpenAGI benchmark tasks or their solutions could have appeared in the training data of GPT-4 or Mixtral."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The OpenAGI benchmark was published in 2023 (NeurIPS). GPT-4 models may have been trained on data that includes these benchmark descriptions. No contamination analysis is provided."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are involved in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants are involved in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "The framework calls GPT-4 and Mixtral multiple times (30 iterations of workflow generation, each requiring workflow execution on the dataset), but no API costs, token counts, or latency figures are reported."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget, GPU hours, or API spend is stated despite the framework requiring significant compute for RL training over 30 iterations with LLM calls."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "AutoFlow-generated workflows outperform manually designed ones (CoRE) by over 40% when using Mixtral as interpreter and over 5% when using GPT-4 as interpreter.",
    296       "evidence": "Tables 1 and 2 in Section 5.5. With Mixtral interpreter: CoRE average 0.2483 vs AutoFlow(GPT) 0.3597. With GPT-4 interpreter: CoRE average 0.6104 vs AutoFlow(Mixtral) 0.6501.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "The combination of different LLMs for generator and interpreter leads to a synergistic effect that improves performance.",
    301       "evidence": "Section 5.5 observes that the best Mixtral-interpreter score uses GPT-4 as generator, and vice versa. However, this is based on a single benchmark with no controlled experiment to isolate the synergy mechanism.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "AutoFlow can produce robust and reliable agent workflows.",
    306       "evidence": "Abstract claim. Results on OpenAGI (Tables 1-2) show improvement over baselines, but robustness is not tested across diverse benchmarks, and reliability is not quantified with variance or multiple runs.",
    307       "supported": "weak"
    308     },
    309     {
    310       "claim": "AutoFlow significantly reduces human labor compared to manually designed workflows.",
    311       "evidence": "Section 6 conclusion. No empirical measurement of human labor reduction is provided; this is stated as a logical consequence of automation rather than measured.",
    312       "supported": "unsupported"
    313     }
    314   ],
    315   "methodology_tags": [
    316     "benchmark-eval"
    317   ],
    318   "key_findings": "AutoFlow is a framework that uses reinforcement learning to automatically generate natural-language workflows for LLM agents, offering both fine-tuning (LoRA on open-source LLMs) and in-context learning (for closed-source LLMs) methods. On the OpenAGI benchmark, AutoFlow-generated workflows outperform manually designed CoRE workflows and other prompting baselines across three task types (text-to-image, text-to-text, image-to-image). An interesting cross-model effect is observed where using different LLMs for generator and interpreter yields the best results.",
    319   "red_flags": [
    320     {
    321       "flag": "Single benchmark evaluation",
    322       "detail": "All experiments are on a single benchmark (OpenAGI) with only 3 task types. Broad claims about 'automated workflow generation for LLM agents' are not supported by such narrow evaluation."
    323     },
    324     {
    325       "flag": "No variance or uncertainty quantification",
    326       "detail": "Results in Tables 1 and 2 are single-point estimates with no error bars, standard deviations, or confidence intervals. Given the stochastic nature of both LLM outputs and RL training, result stability is unknown."
    327     },
    328     {
    329       "flag": "No ablation study",
    330       "detail": "The framework has multiple components (RL optimization, GPT-4 parser for grammar correction, LoRA adapter, CoRE format) but no ablation isolates which components contribute to the improvement."
    331     },
    332     {
    333       "flag": "No limitations section",
    334       "detail": "The paper lacks any dedicated discussion of limitations or threats to validity, making it difficult to assess the boundaries of the claims."
    335     },
    336     {
    337       "flag": "Unquantified computational cost",
    338       "detail": "The framework requires 30 iterations of LLM workflow generation and execution, each involving multiple LLM calls. The practical cost is never reported, making it hard to assess whether the approach is viable."
    339     },
    340     {
    341       "flag": "Self-citation concentration",
    342       "detail": "Several key components of the system (CoRE language, OpenAGI benchmark, AIOS, Formal-LLM) are by the same research group. The evaluation is entirely on their own benchmark, which may introduce favorable experimental conditions."
    343     }
    344   ],
    345   "cited_papers": [
    346     {
    347       "title": "OpenAGI: When LLM Meets Domain Experts",
    348       "authors": ["Yingqiang Ge", "Wenyue Hua", "Kai Mei", "Jianchao Ji", "Juntao Tan", "Shuyuan Xu", "Zelong Li", "Yongfeng Zhang"],
    349       "year": 2023,
    350       "relevance": "The benchmark used for all experiments in this paper; relevant as an LLM agent evaluation framework."
    351     },
    352     {
    353       "title": "CoRE: LLM as Interpreter for Natural Language Programming, Pseudo-Code Programming, and Flow Programming of AI Agents",
    354       "authors": ["Shuyuan Xu", "Zelong Li", "Kai Mei", "Yongfeng Zhang"],
    355       "year": 2024,
    356       "arxiv_id": "2405.06907",
    357       "relevance": "Defines the natural language programming framework (CoRE) used as the workflow representation in AutoFlow."
    358     },
    359     {
    360       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    361       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    362       "year": 2023,
    363       "arxiv_id": "2308.08155",
    364       "relevance": "Multi-agent LLM framework for workflow-based task solving, relevant as a comparison system for agentic programming."
    365     },
    366     {
    367       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    368       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"],
    369       "year": 2023,
    370       "relevance": "Key agentic reasoning framework that combines reasoning and tool use; mentioned as attempted but inapplicable baseline."
    371     },
    372     {
    373       "title": "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines",
    374       "authors": ["Omar Khattab", "Arnav Singhvi", "Paridhi Maheshwari"],
    375       "year": 2023,
    376       "arxiv_id": "2310.03714",
    377       "relevance": "Framework for optimizing LLM pipelines used to implement CoT baseline; relevant to automated LLM workflow optimization."
    378     },
    379     {
    380       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    381       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    382       "year": 2022,
    383       "relevance": "Foundational prompting technique used as a baseline in this paper's evaluation."
    384     },
    385     {
    386       "title": "CAMEL: Communicative Agents for 'Mind' Exploration of Large Language Model Society",
    387       "authors": ["Guohao Li", "Hasan Abed Al Kader Hammoud"],
    388       "year": 2023,
    389       "relevance": "Multi-agent communication framework relevant to understanding LLM agent collaboration and workflow design."
    390     },
    391     {
    392       "title": "Communicative Agents for Software Development",
    393       "authors": ["Chen Qian", "Xin Cong", "Wei Liu"],
    394       "year": 2023,
    395       "arxiv_id": "2307.07924",
    396       "relevance": "LLM agent framework for software development (ChatDev), relevant to agentic workflow design in coding contexts."
    397     },
    398     {
    399       "title": "FlowMind: Automatic Workflow Generation with LLMs",
    400       "authors": ["Zhen Zeng", "William Watson", "Nicole Cho"],
    401       "year": 2024,
    402       "arxiv_id": "2404.13050",
    403       "relevance": "Directly related work on automatic workflow generation with LLMs, a key comparison point."
    404     },
    405     {
    406       "title": "A Survey on Large Language Model based Autonomous Agents",
    407       "authors": ["Lei Wang", "Chen Ma", "Xueyang Feng"],
    408       "year": 2023,
    409       "arxiv_id": "2308.11432",
    410       "relevance": "Comprehensive survey of LLM-based autonomous agents, providing context for the agentic AI landscape."
    411     },
    412     {
    413       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    414       "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"],
    415       "year": 2024,
    416       "relevance": "Advanced reasoning framework for LLMs relevant to structured problem-solving approaches."
    417     },
    418     {
    419       "title": "Formal-LLM: Integrating Formal Language and Natural Language for Controllable LLM-based Agents",
    420       "authors": ["Zelong Li", "Wenyue Hua", "Hao Wang", "He Zhu", "Yongfeng Zhang"],
    421       "year": 2024,
    422       "arxiv_id": "2402.00798",
    423       "relevance": "Related work on integrating formal language with LLM agents for controllability, from the same research group."
    424     }
    425   ]
    426 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs