scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20777B)
      1 {
      2   "paper": {
      3     "title": "BioTrouble: A Multi-Agent Workflow for Troubleshooting Molecular Biology Techniques",
      4     "authors": ["Mehrdad Ameri", "Hannie Yousefabadi", "Amin Ramezani"],
      5     "year": 2026,
      6     "venue": "bioRxiv",
      7     "doi": "10.64898/2025.12.30.697016"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repository provided: https://github.com/mehrdadameri/BioTrouble. The paper states 'All code and data related to this project are available at GitHub.'"
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper states 'All code and data related to this project are available at GitHub' with a link to the repository."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is mentioned in the paper. The paper mentions LangGraph framework and OpenRouter API but does not specify library versions or dependencies."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README instructions or commands to replicate experiments are described."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are presented as bar charts (Figures 4, 5, 6) but no confidence intervals, error bars, or ± notation are mentioned in the text."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims 'the multi-agent achieved significantly better scores than the baseline' but no statistical significance tests (p-values, t-tests, etc.) are reported."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports '40% cost reduction' with specific dollar amounts ($0.0036 vs $0.006 per request). Performance comparisons across metrics are shown in figures with baseline context."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "200 examples used for evaluation and 20 questions for learning system evaluation. No justification for why these sizes were chosen. The paper acknowledges '200 questions... represents a small fraction' but provides no power analysis."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviation, variance, or spread measures reported. Results appear to be single-run evaluations. The learning system evaluation uses 5 iterations but no variance across runs is reported."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares BioTrouble against baseline models: 'We used a mirror of the model assigned to this generator agent as a baseline to compare its results with the full BioTrouble pipeline.' GPT-4o baseline is also included."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include GPT-4o (state-of-the-art LLM) and the same SLMs (Qwen3 8B, Llama 3.1 8B, Gemma 2 9B) used standalone. These are contemporary models."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study is conducted. The system has multiple components (RAG, model routing, learning system, 8 agents) but none are individually removed to measure their contribution."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics are used including Wrong Rate and other metrics shown in Figures 4 and 5. The paper references 'all metrics we considered for this evaluation.'"
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Evaluation uses LLM-as-a-Judge only: 'In an LLM-as-a-Judge evaluation using 200 examples.' No human expert evaluation of the troubleshooting output quality is performed. The learning system uses human feedback but that is part of the system, not an independent evaluation."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No mention of train/test split or held-out test set. The 200 evaluation examples are not described as being separated from any development data."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper provides categorical comparison by difficulty level: 'in a categorical comparison (difficulty level of questions), the multi-agent achieved significantly better scores than the baseline across all four model evaluation runs.'"
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The learning system evaluation specifically selects 'the top 20 challenging questions that the agent struggled to answer during the main evaluation,' indicating failure cases were identified and addressed."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "Every experiment shows improvement. No configurations that failed or approaches that were tried and abandoned are reported."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims that BioTrouble 'generated comparable troubleshooting recommendations using small language models' compared to SOTA LLM are supported by the evaluation in Figures 4-5 showing SLMs achieving comparable scores to GPT-4o."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims like 'BioTrouble can generate a better troubleshooting plan based on user feedback' and that the multi-agent workflow causes improvement over baselines. However, the comparison is full pipeline vs standalone model — many variables change simultaneously (RAG, routing, multi-agent coordination), making it impossible to attribute causation to any specific component."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title claims 'Molecular Biology Techniques' broadly, but the knowledge base covers a limited set of experiments (Table 2). The paper acknowledges 'we didn't cover all existing molecular biology experiments' in the discussion but the title and abstract do not bound this."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations for the results are discussed. The improvement could be due to RAG providing relevant context rather than the multi-agent architecture, but this is not explored. No threats-to-validity section."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models listed by name (e.g., 'GPT-4o', 'gpt-4o-mini', 'gpt-4.1-mini', 'qwen3-8b', 'llama-3.1-8b', 'gemma-2-9b', 'granite-4.0-h-micro') but no snapshot dates or API versions are provided for the OpenAI models."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "No actual prompt text is provided. The paper describes agent roles in natural language but does not include the system prompts or instructions given to any of the 8 agents."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any of the models used. Token thresholds for model routing are mentioned but specific values are not given."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The multi-agent architecture is described in detail: 8 specialized agents, LangGraph framework, state-based architecture, model routing system, RAG retrieval, learning system with Reflector and Curator agents. Workflow diagrams referenced in Figures 1-3."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The 200 evaluation questions are not described in terms of how they were created or selected. The knowledge base construction process is briefly mentioned ('carefully gathered protocols and troubleshooting guidelines from sources licensed under CC BY') but details of curation are missing."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Limitations are discussed in the Discussion section: 'A primary limitation of this project is the availability of free high-quality data' and 'Another limitation of this project is the small evaluation dataset.'"
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific limitations mentioned: limited CC BY data availability affecting knowledge base quality, small evaluation dataset of 200 questions, incomplete coverage of molecular biology experiments. These are specific to this study."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "While limitations are mentioned, the paper does not explicitly state what the results do NOT show. No explicit bounding of claims to the tested molecular biology experiments or specific models evaluated."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The paper states 'All code and data related to this project are available at GitHub: https://github.com/mehrdadameri/BioTrouble.'"
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "The 200 evaluation questions' creation/collection process is not described. The knowledge base sources are vaguely described as 'protocols and troubleshooting guidelines from sources licensed under Creative Commons Attribution (CC BY)' without specifics."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants in the evaluation. The LLM-as-a-Judge evaluation and learning system evaluation do not involve recruited participants."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The pipeline from knowledge base construction to evaluation questions to final results is not documented with filtering criteria or counts of examples at each stage."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding acknowledgment or statement about funding sources appears in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Shiraz University of Medical Sciences, Shiraz Institute for Cancer Research, and University of Tehran. No third-party product being evaluated is affiliated with these institutions; BioTrouble itself is the authors' own system."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed, so independence cannot be assessed. The absence of a funding statement does not confirm unfunded status."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The paper includes a 'Competing interests' section stating 'All authors declare that they have no competing interests.'"
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate pre-trained model capability on a benchmark. It evaluates a multi-agent system's troubleshooting output quality using LLM-as-a-Judge, not testing model knowledge on known test sets."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable — the paper evaluates a RAG-based system pipeline, not a pre-trained model's knowledge on a benchmark."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable — the evaluation uses custom troubleshooting questions, not a public benchmark that models may have seen during training."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in the study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in the study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in the study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Cost per request reported: 'BioTrouble averaged $0.0036 per request, while GPT-4o averaged $0.006' based on 100 requests via OpenRouter API."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "Total computational budget not stated. Per-request cost is given but total API spend, GPU hours for local models, or total evaluation cost is not reported."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "BioTrouble multi-agent workflow outperforms baseline standalone models across all evaluation metrics",
    286       "evidence": "LLM-as-a-Judge evaluation on 200 examples, results in Figures 4-5 showing multi-agent scores higher than baselines for all four model configurations.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "SLMs in BioTrouble achieve comparable performance to GPT-4o baseline",
    291       "evidence": "Figure 5 shows all three SLMs (Qwen3 8B, Llama 3.1 8B, Gemma 2 9B) achieving 'scores nearly good as the GPT-4o baseline,' with Llama 3.1 8B outperforming GPT-4o on some metrics.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "BioTrouble is 40% cheaper than GPT-4o while maintaining comparable functionality",
    296       "evidence": "$0.0036 vs $0.006 per request based on 100 requests via OpenRouter API.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "BioTrouble's learning system improves response quality over 5 iterations with user feedback",
    301       "evidence": "Top 20 challenging questions re-evaluated over 5 conversations with human feedback, improvement shown in Figure 6. Only tested with Llama 3.1 8B.",
    302       "supported": "weak"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval", "case-study"],
    306   "key_findings": "BioTrouble is a multi-agent RAG-based system using 8 specialized agents for molecular biology troubleshooting. Small language models (8-9B parameters) in the multi-agent pipeline achieve comparable performance to standalone GPT-4o at 40% lower cost ($0.0036 vs $0.006 per request). The system includes a feedback-driven learning mechanism that improves responses over iterations. Evaluation uses LLM-as-a-Judge on 200 questions without statistical rigor (no error bars, significance tests, or ablations).",
    307   "red_flags": [
    308     {
    309       "flag": "LLM-as-a-Judge without human validation",
    310       "detail": "All evaluation relies on LLM-as-a-Judge with no human expert evaluation of troubleshooting quality. For a domain-specific tool in molecular biology, expert validation is critical — an LLM judge may not catch scientifically incorrect advice."
    311     },
    312     {
    313       "flag": "No ablation study",
    314       "detail": "The system has 8 agents, RAG, model routing, and a learning system, but no ablation shows which components contribute to performance. The improvement could be entirely due to RAG rather than the multi-agent architecture."
    315     },
    316     {
    317       "flag": "No statistical rigor",
    318       "detail": "No error bars, confidence intervals, significance tests, or variance across runs reported despite claims of 'significantly better scores.' Results appear to be single-run evaluations on 200 examples."
    319     },
    320     {
    321       "flag": "Small evaluation dataset",
    322       "detail": "Only 200 questions for main evaluation and 20 for learning system evaluation. The paper acknowledges this is 'a small fraction of the troubleshooting scenarios' but draws broad conclusions."
    323     },
    324     {
    325       "flag": "Unfair baseline comparison",
    326       "detail": "The baseline is a standalone model without RAG or any context, while BioTrouble provides curated domain knowledge via RAG. This comparison tests whether RAG helps, not whether multi-agent architecture helps."
    327     }
    328   ],
    329   "cited_papers": [
    330     {
    331       "title": "A survey on large language model based autonomous agents",
    332       "authors": ["L. Wang", "C. Ma", "X. Feng"],
    333       "year": 2024,
    334       "relevance": "Foundational survey on LLM-based autonomous agents relevant to understanding agentic AI architectures."
    335     },
    336     {
    337       "title": "Small Language Models are the Future of Agentic AI",
    338       "authors": ["P. Belcak", "G. Heinrich"],
    339       "year": 2025,
    340       "arxiv_id": "2506.02153",
    341       "relevance": "Directly relevant to the survey's scope on small language models for agentic systems."
    342     },
    343     {
    344       "title": "Small Language Models for Agentic Systems: A Survey of Architectures, Capabilities, and Deployment Trade offs",
    345       "authors": ["R. Sharma", "M. Mehta"],
    346       "year": 2025,
    347       "arxiv_id": "2510.03847",
    348       "relevance": "Survey of SLM architectures for agentic systems, relevant to understanding the landscape of small model agents."
    349     },
    350     {
    351       "title": "Agentic context engineering: Evolving contexts for self-improving language models",
    352       "authors": ["Q. Zhang", "C. Hu"],
    353       "year": 2025,
    354       "arxiv_id": "2510.04618",
    355       "relevance": "ACE framework that BioTrouble's learning system is inspired by; relevant to context engineering for AI agents."
    356     },
    357     {
    358       "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena",
    359       "authors": ["L. Zheng", "W.-L. Chiang"],
    360       "year": 2023,
    361       "relevance": "Foundational work on LLM-as-a-Judge evaluation methodology used by BioTrouble."
    362     },
    363     {
    364       "title": "A survey on RAG meeting LLMs: Towards retrieval-augmented large language models",
    365       "authors": ["W. Fan", "Y. Ding"],
    366       "year": 2024,
    367       "relevance": "Survey on RAG techniques relevant to understanding retrieval-augmented generation in agentic systems."
    368     },
    369     {
    370       "title": "Retrieval-augmented generation for large language models: A survey",
    371       "authors": ["Y. Gao", "Y. Xiong"],
    372       "year": 2023,
    373       "arxiv_id": "2312.10997",
    374       "relevance": "Comprehensive RAG survey relevant to the survey's coverage of retrieval-augmented LLM systems."
    375     },
    376     {
    377       "title": "The rise and potential of large language model based agents: a survey",
    378       "authors": ["Z. Xi", "W. Chen"],
    379       "year": 2025,
    380       "relevance": "Major survey on LLM-based agents relevant to the survey's scope on agentic AI systems."
    381     }
    382   ]
    383 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs