ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24030B)


      1 {
      2   "paper": {
      3     "title": "From Single to Multi-Agent Reasoning: Advancing GeneGPT for Genomics QA",
      4     "authors": ["Kimia Abedini", "Farzad Shami", "Gianmaria Silvello"],
      5     "year": 2026,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2601.10581",
      8     "doi": "10.48550/arXiv.2601.10581"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "GenomAgent, a multi-agent framework for genomic QA, outperforms the replicated GeneGPT system by 12% on average (0.93 vs 0.83) on the GeneTuring benchmark while reducing computational cost by 79% ($2.11 vs $10.06). The largest gains come in sequence alignment tasks (28.8% improvement). The paper also provides a reproducibility study of GeneGPT, identifying stop-token parsing failures, context loss, and incomplete data coverage as key error types.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "A project page URL is provided in the abstract: https://kimia-abedini.github.io/Genom-Agent/. This is a public URL for the project."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The evaluation uses the publicly available GeneTuring benchmark (Hou & Ji, 2023), comprising 12 tasks with 50 QA pairs each. The benchmark is a standard public dataset."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, requirements.txt, Dockerfile, or dependency versions are mentioned in the paper."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. The project page is referenced but the paper itself contains no reproduction guide."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results in Tables 1 and 2 are point estimates with no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims GenomAgent 'outperforms' GeneGPT based solely on comparing point scores (0.93 vs 0.83) with no statistical significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Percentage improvements are reported with baseline context throughout: '12% increase in average performance (0.93 vs. 0.83)', '79% reduction in computational cost ($2.11 vs. $10.06)', and per-category improvements in Table 2."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Each task has 50 QA pairs from the GeneTuring benchmark, but no justification is given for why this sample size is adequate for the claims made."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or multiple-run statistics are reported. Results appear to be from single runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Multiple GeneGPT configurations (Full, Slim, Turbo, Lang) serve as baselines, and the original paper's baselines (Bing Chat, BioMedLM, GPT-3) are also referenced."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The main baseline GeneGPT was published in 2024 and uses deprecated models (code-davinci-002, GPT-3.5-turbo-16k). The paper acknowledges a concurrent work (Chen et al., 2025, ref [5]) but mentions it only as a future comparison rather than including it as a baseline."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The paper explicitly acknowledges in Future Work (Section 6): 'the 12% average improvement cannot be cleanly attributed to specific architectural choices without systematic ablation analysis.' No ablation is performed."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple task-specific metrics are used: exact match accuracy for nomenclature, recall for association tasks, vocabulary-mapped matching for cross-species alignment, and partial scoring for human genome alignment."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The authors manually reviewed and categorized all mistakes made by the reproduced GeneGPT system into three error types (E1, E2, E3) in Section 3."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The GeneTuring benchmark is a fixed test set of 50 QA pairs per task, not used for any tuning or development."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Tables 1 and 2 provide detailed per-task breakdowns across all 9 tasks grouped into 4 categories (nomenclature, genomic location, functional analysis, sequence alignment)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 3 categorizes reproduced GeneGPT errors into E1 (incomplete data coverage), E2 (stop-token parsing failures), and E3 (context loss). However, GenomAgent's failures are not discussed."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The reproducibility study shows degraded performance in turbo settings (Table 1), including an 83.33% drop on DNA-to-Human alignment. The lang configuration showed massive variability."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims a 12% average improvement, which matches the Table 2 results (0.93 vs 0.83). The 79% cost reduction claim is also supported ($2.11 vs $10.06)."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper claims GenomAgent's multi-agent architecture 'addresses efficiency bottlenecks' and 'delivers superior accuracy,' but the authors themselves acknowledge the improvement 'cannot be cleanly attributed to specific architectural choices without systematic ablation analysis.' Multiple variables change simultaneously: model (GPT-4o-mini vs deprecated models), architecture (multi-agent vs single-agent), APIs (multi-source vs single-source), and evaluation protocol adjustments."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The abstract claims the 'flexible architecture extends beyond genomics to various scientific domains needing expert knowledge extraction', but testing is only on GeneTuring. Although the title scopes the work to 'Genomics QA', this broader generalization claim in the abstract is unsupported. The paper acknowledges in limitations that evaluation is 'limited to the GeneTuring benchmark.'"
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The improvement could be due to the model upgrade (GPT-4o-mini vs deprecated Codex/GPT-3.5), multi-source database access, or evaluation protocol changes (expanded vocabulary, partial scoring), rather than the multi-agent architecture itself. The paper acknowledges only in general terms that the gain cannot be attributed to specific architectural choices without ablation; these individual confounds are not discussed."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures specific task-level metrics (exact match, recall, partial scoring) and reports them as task-level performance. The macro-average is explicitly acknowledged as 'a simplified view of the system's diverse capabilities across heterogeneous genomics QA tasks' (Section 2)."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper states 'GPT-4o-mini' as the replacement model without any version date or API snapshot identifier."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper describes prompting strategies and agent roles in natural language but does not provide actual prompt text. Agent-specific instructions are described functionally but not reproduced."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any model configuration."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 4 and Figure 1 describe the multi-agent architecture in detail: Task Detection Agent, MCP Agent, Response Handler Agent, Feature Extractor Agent, Code Writer Agent, Code Executor Agent, and Final Decision Agent, with their roles and interactions. Built on Google Agent Development Kit."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The paper documents changes from the original GeneGPT: removal of context truncation, explicit prompt format enforcement, expanded vocabulary mappings, and enhanced partial scoring mechanisms (Section 5)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6 (Final Remarks and Future Work) contains substantive discussion of limitations including lack of ablation, benchmark scope limitation, and need for hybrid approaches."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The paper states specific limitations: '12% average improvement cannot be cleanly attributed to specific architectural choices without systematic ablation analysis' and 'evaluation is limited to the GeneTuring benchmark. This restricted scope prevents us from fully validating GenomAgent's generalizability.'"
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly states: evaluation limited to GeneTuring benchmark, no ablation analysis performed, no comparison with emerging frameworks like Chen et al. (2025). These are specific things not tested."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw output data (model responses, API call logs) is made available. Only aggregated scores are reported."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The GeneTuring benchmark is well-described: it comprises 12 tasks with 50 QA pairs each, of which the 9 tasks used in the original GeneGPT paper are selected. The benchmark source (Hou & Ji, 2023) is cited."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data comes from a standard public benchmark."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The processing pipeline is documented: query → Task Detection Agent → MCP Agent (parallel API calls) → Response Handler → Final Decision Agent. Evaluation protocol including metric choices is described."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgments section states: 'partially supported by the HEREDITARY Project, as part of the European Union's Horizon Europe research and innovation programme under grant agreement No GA 101137074.'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: University of Padua (Italy) and Aalto University (Finland). No commercial affiliations."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "The EU Horizon Europe HEREDITARY project is a research grant with no commercial stake in GenomAgent's performance vs GeneGPT."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Disclosure of Interests section states: 'The authors have no competing interests to declare that are relevant to the content of this article.'"
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "GPT-4o-mini is used but its training data cutoff is not stated. The GeneTuring benchmark (2023) could be in the training data."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether GPT-4o-mini's training data includes GeneTuring benchmark questions or answers."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "GeneTuring was published in 2023 (bioRxiv), and GPT-4o-mini was trained after this date, so the benchmark may be present in its training data. No contamination analysis is performed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Table 2 reports per-category and total costs for both GenomAgent ($2.11 total) and all GeneGPT configurations ($10.06-$16.76), computed from token counts and real model pricing."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "While API costs are reported, no total compute budget, GPU hours, or wall-clock time is stated."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be single-run."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is not stated. It appears each configuration was run once."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search or search budget is described, even though the agent configurations and prompt designs are tunable design choices."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "GenomAgent appears to be a single configuration. No description of how design choices were selected or whether alternatives were tried."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement both GenomAgent and the reproduced GeneGPT baseline. No acknowledgment of author-evaluation bias in baseline reimplementation."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Table 2 and Figure 2 explicitly compare performance against cost for all systems, showing the performance-cost tradeoff."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The GeneTuring benchmark is used without discussion of whether its 50-question QA format adequately measures genomic QA capability or whether it represents real-world genomic research needs."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "GenomAgent uses a completely different architecture (multi-agent on Google ADK with GPT-4o-mini) compared to GeneGPT (single-agent with deprecated models). The improvement is attributed to the multi-agent architecture, but the model, framework, and APIs all differ simultaneously. The authors acknowledge this in limitations but do not address it experimentally."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "GeneTuring (2023) predates GPT-4o-mini's training. No discussion of whether the model has seen the benchmark questions during training."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup (API access to NCBI, HGNC, UCSC) provides information that constitutes feature leakage."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of independence between any in-context learning examples and test questions."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection or prevention method is applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "GenomAgent achieves an average performance score of 0.93, a 12% improvement over GeneGPT's best score of 0.83.",
    365       "evidence": "Table 2 shows per-task scores across 9 GeneTuring tasks. GenomAgent scores 0.98 on nomenclature, 0.98 on genomic location, 0.89 on functional analysis, and 0.85 on sequence alignment.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "GenomAgent reduces computational cost by 79% ($2.11 vs $10.06).",
    370       "evidence": "Table 2 reports per-category costs computed from token counts and model pricing. GenomAgent total is $2.11 vs GeneGPT Slim's $10.06.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Sequence alignment tasks see the largest improvement (28.8%).",
    375       "evidence": "Table 2 shows GenomAgent scores 0.85 on sequence alignment vs GeneGPT Slim's 0.66, a 28.8% relative improvement.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "GeneGPT's turbo configuration shows degraded performance when reproduced with GPT-4o-mini due to stop-token incompatibility.",
    380       "evidence": "Table 1 shows relative differences ranging from -83.33% (DNA to Human) to +6.25% (Gene Alias). Error analysis identifies E2 (stop-token parsing failures) as the dominant error type.",
    381       "supported": "strong"
    382     }
    383   ],
    384   "red_flags": [
    385     {
    386       "flag": "Confounded comparison",
    387       "detail": "GenomAgent changes the model (GPT-4o-mini vs deprecated Codex/GPT-3.5), architecture (multi-agent vs single-agent), data sources (multi-database vs single), and framework (Google ADK vs custom) simultaneously. The 12% improvement cannot be attributed to any specific change, as the authors themselves acknowledge."
    388     },
    389     {
    390       "flag": "No error bars or variance on single-run results",
    391       "detail": "All results appear to be from single runs with 50 questions per task. LLM outputs are stochastic, and with N=50 per task, point estimates could vary substantially across runs. No variance or confidence intervals are reported."
    392     },
    393     {
    394       "flag": "Evaluation protocol changes",
    395       "detail": "GenomAgent uses 'expanded vocabulary mappings' and 'enhanced partial scoring' compared to GeneGPT's original protocol. While the paper states these were applied to both systems, these changes could systematically favor the multi-source system that returns more diverse nomenclature."
    396     },
    397     {
    398       "flag": "Benchmark contamination risk",
    399       "detail": "GeneTuring was published in 2023 on BioRxiv. GPT-4o-mini was trained after this date and may have seen the benchmark questions during training. No contamination analysis is performed."
    400     }
    401   ],
    402   "cited_papers": [
    403     {
    404       "title": "GeneGPT: Augmenting large language models with domain tools for improved access to biomedical information",
    405       "authors": ["Qiao Jin", "Yifan Yang", "Qingyu Chen", "Zhiyong Lu"],
    406       "year": 2024,
    407       "relevance": "State-of-the-art system for tool-augmented LLM genomic QA that this paper replicates and extends."
    408     },
    409     {
    410       "title": "Why do multi-agent LLM systems fail?",
    411       "authors": ["Mert Cemri"],
    412       "year": 2025,
    413       "arxiv_id": "2503.13657",
    414       "relevance": "Analyzes failure modes of multi-agent LLM systems, directly relevant to agentic AI evaluation methodology."
    415     },
    416     {
    417       "title": "Beyond GeneGPT: A multi-agent architecture with open-source LLMs for enhanced genomic question answering",
    418       "authors": ["H. Chen", "G. Zuccon", "T. Leelanupab"],
    419       "year": 2025,
    420       "relevance": "Concurrent work on multi-agent genomic QA with open-source LLMs, relevant as a contemporary baseline."
    421     },
    422     {
    423       "title": "Evaluating large language models trained on code",
    424       "authors": ["Mark Chen"],
    425       "year": 2021,
    426       "arxiv_id": "2107.03374",
    427       "relevance": "Codex paper used as the base model in original GeneGPT; foundational to code-generating LLM evaluation."
    428     },
    429     {
    430       "title": "The landscape of emerging AI agent architectures for reasoning, planning, and tool calling: A survey",
    431       "authors": ["T. Masterman"],
    432       "year": 2024,
    433       "arxiv_id": "2404.11584",
    434       "relevance": "Survey of AI agent architectures relevant to understanding single vs multi-agent system design."
    435     },
    436     {
    437       "title": "ReAct: Synergizing reasoning and acting in language models",
    438       "authors": ["Shunyu Yao"],
    439       "year": 2023,
    440       "relevance": "ReAct framework used in GeneGPT's lang configuration; foundational agentic reasoning method."
    441     },
    442     {
    443       "title": "Language models are few-shot learners",
    444       "authors": ["Tom Brown"],
    445       "year": 2020,
    446       "relevance": "GPT-3 paper establishing in-context learning, the foundational technique used by GeneGPT."
    447     },
    448     {
    449       "title": "LLM with tools: A survey",
    450       "authors": ["Zhiheng Shen"],
    451       "year": 2024,
    452       "arxiv_id": "2409.18807",
    453       "relevance": "Survey of tool-augmented LLMs directly relevant to the tool-integration approach evaluated here."
    454     }
    455   ]
    456 }

Impressum · Datenschutz