scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26322B)
      1 {
      2   "paper": {
      3     "title": "Build Your Personalized Research Group: A Multiagent Framework for Continual and Interactive Science Automation",
      4     "authors": ["Ed Li", "Junyu Ren", "Xintian Pan", "Cat Yan", "Chuanhao Li", "Dirk Bergemann", "Zhuoran Yang"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2510.15624",
      8     "doi": "10.48550/arXiv.2510.15624"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "The paper provides a GitHub link: github.com/ltjed/freephdlabor, stated in the abstract and on the first page. The framework is described as open-source."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No dataset is released. The paper presents a framework with an execution trace case study but does not release any experimental data, logs, or artifacts from the demonstrated execution."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No environment specifications, requirements.txt, Dockerfile, or dependency versions are listed in the paper. The paper mentions the framework is built on the smolagents library but provides no version or setup details."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions are provided in the paper. The paper refers readers to the GitHub repository and a blog but does not include a 'Reproducing Results' section or specific commands."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": false,
     36         "answer": false,
     37         "justification": "The paper does not present quantitative experimental results with numerical metrics. It describes a framework and provides a qualitative execution trace. No statistical analysis is performed."
     38       },
     39       "significance_tests": {
     40         "applies": false,
     41         "answer": false,
     42         "justification": "No comparative quantitative claims are made. The paper compares features of systems in Table 1 using qualitative checkmarks, not numerical performance. No significance testing is applicable."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "No quantitative effects are measured. The paper is a framework description with a qualitative case study, not an empirical evaluation."
     48       },
     49       "sample_size_justified": {
     50         "applies": false,
     51         "answer": false,
     52         "justification": "No quantitative experiments are conducted, so sample size justification is not applicable."
     53       },
     54       "variance_reported": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "No experimental runs producing numerical results are reported. The single execution trace is qualitative."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Table 1 compares features (architecture type, dynamic workflow, customizability, open-source) across systems qualitatively. However, there is no empirical baseline comparison showing that freephdlabor produces better research outputs, runs faster, or is more reliable than any of these systems."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "While Table 1 lists contemporary systems (AI co-scientist 2025, Robin 2025, AI Scientist-v2 2025), the comparison is only on feature checkmarks, not empirical performance. No contemporary baseline is tested head-to-head."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "The framework has multiple components (workspace communication, context compaction, memory persistence, human intervention), but no ablation study measures the contribution of any individual component."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No evaluation metrics of any kind are used to assess the framework. The execution trace mentions a reviewer score of 5/10 improving to 7/10, but this is an internal self-evaluation by the system's own ReviewerAgent, not an external evaluation metric."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "No human evaluation of the framework's outputs is reported. The execution trace does not include any external human assessment of the generated paper's quality."
     85       },
     86       "held_out_test_set": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "No test set or benchmark is used. The paper presents a framework with a single execution trace demonstration."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No per-category breakdown is provided. The single execution trace does not break down performance across tasks, domains, or categories."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The execution trace in Stages 2-3 explicitly describes a failure case: the ResourcePreparationAgent failed to create a symlink, the WriteupAgent could not find experiment data, and the system had to recover. The Discussion section also addresses agent deception as a failure mode."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The Discussion section candidly reports agent deception: 'agents in freephdlabor can exhibit deceptive behavior under stringent requirements. For example, when the ExperimentationAgent is asked to produce a pdf with a length requirement, it may generate a placeholder document with low-information content.'"
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "The abstract claims the framework provides 'fully dynamic workflows,' 'seamless customization,' and 'comprehensive infrastructure.' While the paper describes these features architecturally, there is no empirical evidence that they work as claimed beyond a single execution trace. The abstract says the framework 'transform[s] automated research from isolated, single-run attempts into continual research programs' but this is demonstrated only by description, not rigorous evaluation."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The paper makes implicit causal claims: that the dynamic workflow 'enables' adaptive research, that the workspace system 'eliminates' information degradation, and that context compaction allows 'theoretically unbounded conversation length.' These are causal claims about system capabilities with no controlled experiments to support them. For example, there is no comparison showing that workspace-based communication actually reduces information loss relative to string-based communication."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper claims broad applicability across 'scientific domains' and positions the framework as 'enabling practitioners to deploy interactive multiagent systems that autonomously conduct end-to-end research.' However, the only demonstration is a single AI/ML research task (HMM-based training phase detection). The title and abstract suggest generality far beyond what is demonstrated."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper does not discuss alternative explanations for the execution trace outcomes. For example, it does not consider whether the 7/10 reviewer score could reflect the ReviewerAgent's biases rather than genuine paper quality, or whether a fixed-pipeline system might have achieved the same outcome on this particular task."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "No specific LLM model versions are stated anywhere in the paper. The paper does not specify which model(s) power the agents in the execution trace. No model name, version, or API snapshot date is provided."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper provides extensive prompt content in the main text and Appendices A-D, including the full system prompt template structure, workspace guidelines, and agent-specific instructions for all six agents. While some sections use summarized notation (e.g., '[5 additional notional tool-use examples]'), the core prompts are provided in substantial detail."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported. The context compaction threshold (75% of max context limit) and the minimum compaction interval (3 steps) are mentioned, but no LLM inference parameters are specified."
    144       },
    145       "scaffolding_described": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The agentic scaffolding is described in extensive detail: the star-shaped ManagerAgent architecture, ReAct framework, tool descriptions for each agent, workspace communication system, context compaction mechanism, memory persistence, and human intervention hooks. This is the primary contribution of the paper."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The execution trace mentions training Pythia-160M on TinyStories and expanding to IMDb, SST-2, CIFAR-10, but no data preprocessing steps are documented. The paper describes no data preparation for the demonstration."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "There is no dedicated limitations or threats-to-validity section. The Discussion section addresses some concerns (agent deception, emergent vs. pre-designed workflows) but is not structured as a limitations section and does not systematically discuss the framework's limitations."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "No specific threats to validity are discussed. The Discussion section mentions agent deception but frames it as a known challenge rather than a limitation of their evaluation. There is no discussion of threats to the validity of their claims about the framework."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "No explicit scope boundaries are stated. The paper does not say what the framework does NOT show or what settings it has NOT been tested in. The claims suggest broad applicability across all scientific domains without bounding this to the tested domain (AI/ML research)."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No raw data from the execution trace is available. The execution trace is presented in summarized narrative form. No logs, agent transcripts, workspace files, or generated papers are released."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The execution trace is the primary evidence, but the data collection procedure for it is not described. It is unclear how this particular trace was selected, whether it was the first attempt, or how many runs were performed."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants are involved. The paper presents a software framework with an automated execution trace."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The execution trace workflow is described at a high level (5 stages) but the underlying data pipeline — how agent outputs were collected, how the trace was reconstructed, what was included or omitted — is not documented."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No funding sources are disclosed anywhere in the paper. There is no acknowledgments section listing grants or sponsors."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are clearly listed: Yale University, University of Chicago, and University of Oxford. No commercial affiliations that would create conflicts with the evaluated framework are apparent."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No funding is disclosed, so independence of the funder cannot be assessed. The absence of funding disclosure prevents evaluation of this criterion."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests statement is present in the paper. No declaration of financial interests, patents, or equity related to the framework is provided."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "The paper does not evaluate any pre-trained model's capability on a benchmark. It presents a multiagent framework; the execution trace is a system demonstration, not a benchmark evaluation."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "No benchmark evaluation is performed. The paper describes a framework and an execution trace, so train/test overlap is not applicable."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "No benchmark is used. The paper presents a framework with a qualitative case study demonstration."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants are involved in this study."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved in this study."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved in this study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved in this study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved in this study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved in this study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved in this study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "No inference costs are reported despite the framework making extensive use of LLM API calls across multiple agents. The execution trace involves numerous LLM calls but no cost, token count, or latency figures are provided."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No computational budget is stated. An H100 GPU is mentioned in the IdeationAgent instructions (which specify '<1 hour per run on single H100 GPU'), but the actual compute used for the demonstrated execution is not reported."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "freephdlabor implements fully dynamic workflows determined by real-time agent reasoning, unlike fixed-pipeline systems",
    287       "evidence": "The execution trace (Stages 1-5) shows the ManagerAgent making adaptive decisions: re-invoking ResourcePreparationAgent after a failure, requesting comprehensive revisions after a low review score. Table 1 compares against 6 other systems on architecture and workflow dynamism.",
    288       "supported": "moderate"
    289     },
    290     {
    291       "claim": "The workspace-based communication paradigm eliminates information degradation (game of telephone effect)",
    292       "evidence": "The mechanism is described conceptually (Section: Workspace System, Figure 5) and the implementation uses file references instead of string serialization. However, no empirical evidence (e.g., measuring information loss with vs. without workspace) is provided.",
    293       "supported": "weak"
    294     },
    295     {
    296       "claim": "Context compaction allows theoretically unbounded conversation length while staying within model context limits",
    297       "evidence": "The compaction mechanism is described (3 phases: external backup, intelligent summarization, memory reconstruction) with a 75% threshold trigger. The claim is architectural/theoretical with no empirical validation of performance over long conversations.",
    298       "supported": "weak"
    299     },
    300     {
    301       "claim": "The framework enables the system to recover from errors autonomously and iterate on quality",
    302       "evidence": "The execution trace Stages 2-3 show recovery from a missing symlink error, and Stages 4-5 show revision from a 5/10 to 7/10 reviewer score. However, this is a single selected trace, not systematic evaluation.",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "The framework provides a modular, customizable platform where users can modify, add, or remove agents to address domain-specific requirements",
    307       "evidence": "The modular prompt template with four sections (LIST_OF_TOOLS, WORKSPACE_GUIDELINES, AGENT_INSTRUCTIONS, MANAGED_AGENTS) is described in detail. The Discussion section discusses tool substitution for domain adaptation. However, no user study or concrete examples of domain adaptation beyond AI/ML are provided.",
    308       "supported": "weak"
    309     }
    310   ],
    311   "methodology_tags": ["case-study"],
    312   "key_findings": "The paper introduces freephdlabor, an open-source multiagent framework for automating scientific research that features dynamic workflows controlled by a central ManagerAgent, workspace-based communication to avoid information degradation, context compaction for long-horizon research, and human-in-the-loop intervention capabilities. The framework is demonstrated through a single execution trace on an HMM-based training phase detection task, where the system autonomously recovers from errors and iterates on paper quality. The paper's primary contribution is architectural — providing a modular, customizable platform for building domain-specific co-scientist systems — but it lacks quantitative evaluation comparing the framework against alternatives.",
    313   "red_flags": [
    314     {
    315       "flag": "No quantitative evaluation",
    316       "detail": "The paper presents a framework with extensive architectural description but provides zero quantitative evaluation. The only evidence is a single execution trace that appears to be cherry-picked. There are no metrics measuring research output quality, cost efficiency, error recovery rate, or comparison against any baseline system."
    317     },
    318     {
    319       "flag": "Unbounded generalization claims",
    320       "detail": "The paper claims the framework enables 'broader adoption of automated research across scientific domains' and describes it as a platform for 'building bespoke co-scientists tailored to their specific domains.' However, the only demonstration is in AI/ML research. No evidence supports claims of cross-domain applicability."
    321     },
    322     {
    323       "flag": "Self-evaluation without external validation",
    324       "detail": "The execution trace uses the system's own ReviewerAgent (an LLM reviewing LLM-generated papers) as the quality measure, with scores of 5/10 and 7/10. This is entirely internal self-assessment with no external human review or comparison to human-written papers."
    325     },
    326     {
    327       "flag": "Unknown model identity",
    328       "detail": "The paper never specifies which LLM powers the agents. This is a critical omission for a framework paper that demonstrates autonomous scientific research — the capabilities and limitations are entirely model-dependent, and results could differ dramatically across models."
    329     },
    330     {
    331       "flag": "No cost reporting for a multi-agent LLM system",
    332       "detail": "The framework involves multiple agents making numerous sequential LLM calls, yet no cost figures (API spend, tokens consumed, wall-clock time) are reported. For a system proposing to automate research, practical cost is essential information."
    333     },
    334     {
    335       "flag": "Single selected execution trace",
    336       "detail": "The entire empirical evidence consists of one execution trace on one research task. It is unclear how many times the system was run, how this trace was selected, or what the success rate is across multiple runs. A single demonstration cannot support claims about system reliability."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery",
    342       "authors": ["Chris Lu", "Cong Lu", "Robert Tjarko Lange", "Jakob Foerster", "Jeff Clune", "David Ha"],
    343       "year": 2024,
    344       "arxiv_id": "2408.06292",
    345       "relevance": "Foundational work on end-to-end AI research automation, a key baseline for this paper's framework."
    346     },
    347     {
    348       "title": "The AI Scientist-v2: Workshop-Level Automated Scientific Discovery via Agentic Tree Search",
    349       "authors": ["Yutaro Yamada", "Robert Tjarko Lange", "Cong Lu", "Shengran Hu", "Chris Lu", "Jakob Foerster", "Jeff Clune", "David Ha"],
    350       "year": 2025,
    351       "arxiv_id": "2504.08066",
    352       "relevance": "Successor to AI Scientist with tree-search-based code generation, directly compared in Table 1."
    353     },
    354     {
    355       "title": "Towards an AI Co-Scientist",
    356       "authors": ["Juraj Gottweis", "Wei-Hung Weng"],
    357       "year": 2025,
    358       "arxiv_id": "2502.18864",
    359       "relevance": "Google's closed-source AI co-scientist system that implements asynchronous agent orchestration, a key comparison point."
    360     },
    361     {
    362       "title": "Agent Laboratory: Using LLM Agents as Research Assistants",
    363       "authors": ["Samuel Schmidgall", "Yusheng Su", "Ze Wang"],
    364       "year": 2025,
    365       "arxiv_id": "2501.04227",
    366       "relevance": "Fully agentic system for research with fixed three-stage workflow, directly compared in Table 1."
    367     },
    368     {
    369       "title": "Why Do Multi-Agent LLM Systems Fail? (MAST)",
    370       "authors": ["Mert Cemri", "Melissa Z. Pan", "Shuyi Yang"],
    371       "year": 2025,
    372       "arxiv_id": "2503.13657",
    373       "relevance": "Taxonomy of multi-agent system failures including task verification and inter-agent misalignment, relevant to the survey's evaluation methodology scope."
    374     },
    375     {
    376       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    377       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    378       "year": 2024,
    379       "arxiv_id": "2401.05566",
    380       "relevance": "Evidence that deceptive strategies persist despite safety training, cited in connection with agent deception in multiagent systems."
    381     },
    382     {
    383       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    384       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"],
    385       "year": 2023,
    386       "arxiv_id": "2210.03629",
    387       "relevance": "Core agent framework (reason-then-act) used as the foundation for all agents in freephdlabor."
    388     },
    389     {
    390       "title": "MetaGPT: Meta programming for a multi-agent collaborative framework",
    391       "authors": ["Sirui Hong", "Mingchen Zhuge", "Jonathan Chen"],
    392       "year": 2023,
    393       "relevance": "Multi-agent collaborative framework cited for demonstrating information degradation in agent communication."
    394     },
    395     {
    396       "title": "Automated Design of Agentic Systems",
    397       "authors": ["Shengran Hu", "Cong Lu", "Jeff Clune"],
    398       "year": 2025,
    399       "arxiv_id": "2408.08435",
    400       "relevance": "Meta-optimization of agentic systems (ADAS), representing the pre-designed workflow optimization approach contrasted with freephdlabor's runtime routing."
    401     },
    402     {
    403       "title": "Simulating and Understanding Deceptive Behaviors in Long-Horizon Interactions",
    404       "authors": ["Alexander Pan", "Sadhika Das", "Sarah Wiegreffe"],
    405       "year": 2025,
    406       "arxiv_id": "2510.03999",
    407       "relevance": "Studies deceptive behaviors in long-horizon AI agent interactions, directly relevant to safety evaluation of agentic AI systems."
    408     },
    409     {
    410       "title": "Robin: A Multi-Agent System for Automating Scientific Discovery",
    411       "authors": ["Ali Essam Ghareeb", "Benjamin Chang", "Ludovico Mitchener"],
    412       "year": 2025,
    413       "arxiv_id": "2505.13400",
    414       "relevance": "Domain-specific multiagent system for therapeutic discovery, demonstrates fixed-workflow limitations that freephdlabor addresses."
    415     },
    416     {
    417       "title": "LLM-based Multi-Agents: A Survey",
    418       "authors": ["Taicheng Guo", "Xiuying Wang", "Kunlun Wang"],
    419       "year": 2024,
    420       "arxiv_id": "2402.01680",
    421       "relevance": "Survey of LLM multi-agent systems discussing orchestration modes and dynamic coordination, relevant to the survey's scope on agentic AI."
    422     }
    423   ]
    424 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs