scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26075B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "From Single to Multi-Agent Reasoning: Advancing GeneGPT for Genomics QA",
      6     "authors": [
      7       "Kimia Abedini",
      8       "Farzad Shami",
      9       "Gianmaria Silvello"
     10     ],
     11     "year": 2026,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2601.10581",
     14     "doi": "10.48550/arXiv.2601.10581"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "Abstract claims GenomAgent 'extends beyond genomics to various scientific domains' but all evaluation is limited to GeneTuring genomics benchmark only. Two of three major abstract claims are supported (12% improvement, 79% cost reduction), but the generalization claim is entirely unsupported.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "Paper makes causal claims ('GenomAgent improves performance') but authors explicitly state in Section 6: 'the 12% average improvement cannot be cleanly attributed to specific architectural choices without systematic ablation analysis.' No ablation studies provided.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Abstract claims extend to 'various scientific domains' but evaluation is strictly limited to 9 GeneTuring genomics tasks. Section 6 acknowledges that 'evaluation is limited to GeneTuring benchmark', which prevents validation of broader claims.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Multiple factors changed simultaneously between GeneGPT reproduction and GenomAgent (model version, implementation framework, prompting modifications) but no analysis discusses which factors drive improvements or considers alternative explanations.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Metrics (exact match accuracy, recall, partial scoring) are appropriate proxies for QA capability on genomics tasks. Computational cost measured directly, not as proxy.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No dedicated limitations section. Section 6 'Final Remarks and Future Work' briefly mentions a few limitations in passing (limited benchmark scope, missing ablation studies, generalizability constraints), but this is not a systematic threats-to-validity discussion.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "For GeneGPT reproduction, three error types identified (E1 incomplete data, E2 parsing failures, E3 context loss). For GenomAgent itself, no specific failure analysis provided. General acknowledgment that 12% improvement 'cannot be cleanly attributed' but no systematic threats analysis.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Paper states evaluation limited to GeneTuring benchmark (9 of 12 tasks) but abstract claims extension to 'various scientific domains' without supporting evidence. Scope boundaries contradict abstract claims.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Funding explicitly disclosed in Acknowledgments: 'partially supported by the HEREDITARY Project, as part of the European Union's Horizon Europe research and innovation programme under grant agreement No GA 101137074.'",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations disclosed (University of Padua, Aalto University). No apparent affiliation with NCBI, HGNC, UCSC databases or OpenAI.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "EU Horizon Europe funding is independent research funding, not company self-evaluation. Funder independence appropriate.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Explicit conflicts of interest statement: 'The authors have no competing interests to declare that are relevant to the content of this article.'",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "Key terms used without precise definition: 'multi-agent framework' (not formally defined), 'in-context learning' (used but not explained), 'ReAct framework' (referenced but not explained), 'GenomAgent' (described but definition emerges gradually).",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Paper explicitly states three contributions: (1) GeneGPT reproducibility study with newer models, (2) GenomAgent multi-agent architecture, (3) performance and cost improvements. These are clearly articulated.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "Limited engagement with prior work. No dedicated related work section. Primary engagement is with GeneGPT. Mentions multi-agent advances [4] but insufficient coverage of LLM tool-use, prompt engineering, or agent orchestration literature.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "Project website URL provided (https://kimia-abedini.github.io/Genom-Agent/) but no explicit statement that source code is released or available for download. No GitHub repository confirmed.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "Evaluation uses GeneTuring benchmark (presumably public) but paper does not explicitly confirm it is publicly available or released. No statement about releasing experiment queries, results, or logs.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No environment specifications provided: no requirements.txt, Dockerfile, or dependency versions. Only mentions 'Google Agent Development Kit' and 'GPT-4o-mini' without version snapshots or implementation language specification.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions provided. System architecture described but implementation details for setup, configuration, and execution are missing. Cannot reproduce from paper alone.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "No confidence intervals or error bars reported. Tables 1 and 2 show single point estimates per task with no variance bounds. No discussion of measurement uncertainty.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "Comparative claims made throughout (GenomAgent vs GeneGPT) but no statistical significance tests, p-values, or hypothesis tests reported. Differences not tested for statistical significance.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Effect sizes reported: 12% average improvement, 28.8% sequence alignment improvement, 79% cost reduction. Improvements expressed with baseline context in Table 2.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "GeneTuring has 12 tasks with 50 pairs each (600 total), but paper evaluates only 9 tasks (~450 pairs). No justification for task selection or power analysis provided.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Only single runs reported. No standard deviations, confidence intervals, or run-to-run variance shown. Results presented as point estimates without uncertainty quantification.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Multiple GeneGPT configurations (Full, Slim, Turbo, Lang) serve as baselines. GenomAgent compared directly against these configurations.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "GeneGPT (2024) is the stated state-of-the-art and contemporary. Section 6 notes that more recent frameworks [5] from 2025 exist but were not compared in this work.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": false,
    192           "justification": "No ablation study provided. Authors explicitly state in Section 6: '12% average improvement cannot be cleanly attributed to specific architectural choices without systematic ablation analysis.' Multiple components changed simultaneously (source diversity, modular agents, dynamic extraction).",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Multiple metrics used: exact match accuracy (nomenclature), recall (associations), partial scoring (alignment), plus computational cost ($) reported for all systems.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": false,
    203           "answer": false,
    204           "justification": "No human evaluation of system outputs. All evaluation is automatic based on benchmark metrics (exact match, recall, partial scoring).",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": false,
    210           "justification": "GeneTuring benchmark used but paper does not explicitly state whether results are on held-out test set vs entire benchmark. Benchmark structure not confirmed in paper.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Results broken down by four task categories (Nomenclature, Genomic Location, Functional Analysis, Sequence Alignment) with per-subtask results shown in Table 1.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": false,
    222           "justification": "Error analysis provided for reproduced GeneGPT (E1/E2/E3 error types) but no failure case discussion for GenomAgent. Table 2 shows residual errors (0.85 alignment score) but reasons not analyzed.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": false,
    228           "justification": "All GenomAgent results show improvements relative to GeneGPT. No negative results or failure conditions reported for the proposed system. Only GeneGPT turbo degradation reported.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "Model identified by marketing name (GPT-4o-mini) without version snapshot date. No date specified for when model snapshot was used, preventing future replication with potentially updated versions.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "Paper describes prompting strategies (API documentation, examples, stop tokens) but actual prompts and system instructions are not provided. Templates with placeholders not shown, preventing reproduction.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "No hyperparameter values reported: temperature not specified, top-p not specified, agent configuration parameters not given. Evaluation modifications mentioned but values not provided.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Multi-agent architecture described with 4 core agents and 3 utility agents. Figure 1 shows workflow. Processing pipelines for JSON vs HTML responses documented at high level.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": false,
    260           "justification": "Some preprocessing mentioned: 'vocabulary mappings' for NCBI species, 'partial scoring mechanisms' for evaluation. Full data pipeline from collection to analysis not documented. Agent input preparation not described.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "No statement that raw data (API responses, query logs, GeneTuring questions/answers) is available. GeneTuring benchmark used but accessibility not confirmed.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": false,
    274           "justification": "Paper uses existing GeneTuring benchmark but does not describe how GeneTuring was constructed or collected. No new data collection described for this work.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants involved. Evaluation is fully automated on benchmark tasks.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": false,
    286           "justification": "High-level pipeline shown in Figure 1 (Query → Task Detection Agent → Processing → Final Decision Agent). Missing: detailed data flow between agents, error handling, retry logic, intermediate result transformations.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Training cutoff for GPT-4o-mini not stated. GeneTuring benchmark from 2023 [8]. No information provided to determine whether benchmark examples could have been in model training data.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No discussion of potential train/test overlap between GeneTuring (2023) and GPT-4o-mini training data. This is a significant concern for benchmark evaluation but entirely unaddressed.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "No discussion of whether GeneTuring examples were available before GPT-4o-mini training cutoff or whether contamination is possible. Critical for model evaluation but not addressed.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants involved.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human subjects involved.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human subjects involved.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human subjects involved.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "Inference cost extensively reported in Table 2: GenomAgent $2.11 total, GeneGPT variants $10.06-$16.76. Per-task costs shown with token counting methodology explained.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": true,
    364           "justification": "Total inference computational budget stated: $2.11 for GenomAgent across all 9 tasks, broken down per task in Table 2. Development/training cost not mentioned.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "GenomAgent outperforms GeneGPT by 12% on average (0.93 vs 0.83)",
    373       "evidence": "Table 2: macro-averaged performance scores across 9 GeneTuring tasks",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Reduces computational costs by 79% ($2.11 vs $10.06)",
    378       "evidence": "Table 2: total cost across all tasks using actual OpenAI token pricing",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Sequence alignment tasks show 28.8% improvement",
    383       "evidence": "Table 2: 0.85 vs 0.66 = (0.85-0.66)/0.66 = 28.8%",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "GenomAgent's modular agents seamlessly adapt to new LLMs and evolving database schemas",
    388       "evidence": "No experiments demonstrating adaptation to other LLMs or database schemas provided",
    389       "supported": "weak"
    390     },
    391     {
    392       "claim": "GeneGPT suffers from three bottlenecks: limited data coverage (E1), parsing failures (E2), context loss (E3)",
    393       "evidence": "Section 3 reproducibility study with manual error categorization of GeneGPT reproduction",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Extension beyond genomics to various scientific domains",
    398       "evidence": "None. Evaluation limited to GeneTuring genomics benchmark only",
    399       "supported": "unsupported"
    400     }
    401   ],
    402   "methodology_tags": [
    403     "benchmark-eval",
    404     "case-study",
    405     "observational"
    406   ],
    407   "key_findings": "GenomAgent achieves 12% average performance improvement over GeneGPT (0.93 vs 0.83 score) on GeneTuring genomics benchmark while reducing computational costs by 79% ($2.11 vs $10.06). The multi-agent architecture, which coordinates specialized agents for task detection, API coordination, response handling, and answer synthesis, shows largest gains on sequence alignment tasks (28.8% improvement). However, the authors acknowledge in Section 6 that the performance improvements 'cannot be cleanly attributed to specific architectural choices without systematic ablation analysis.'",
    408   "red_flags": [
    409     {
    410       "flag": "No ablation studies",
    411       "detail": "Authors explicitly state inability to attribute 12% improvement to specific components. Multiple architectural changes, implementation framework changes (LangGraph vs LangChain), and model version changes (GPT-4o-mini vs original Codex) occurred simultaneously, preventing causal attribution."
    412     },
    413     {
    414       "flag": "Overclaimed generalization",
    415       "detail": "Abstract claims work 'extends beyond genomics to various scientific domains' but all experiments are confined to the GeneTuring genomics benchmark. Section 6 acknowledges that 'evaluation is limited to GeneTuring benchmark', which prevents validation of the claimed generalizability."
    416     },
    417     {
    418       "flag": "No statistical significance testing",
    419       "detail": "All performance claims lack confidence intervals, error bars, or significance tests. Only single point estimates reported without variance. No hypothesis tests for claimed improvements."
    420     },
    421     {
    422       "flag": "Incomplete reproduction materials",
    423       "detail": "No code repository confirmed available, no actual prompts provided (only descriptions), no hyperparameters specified, no version snapshots for models. Prevents independent reproduction despite modularity claims."
    424     },
    425     {
    426       "flag": "Evaluation scope unclear",
    427       "detail": "Only 9 of 12 GeneTuring tasks evaluated. Selection rationale not explained. Potential selection bias for favorable tasks."
    428     },
    429     {
    430       "flag": "Contamination risk unaddressed",
    431       "detail": "GPT-4o-mini training cutoff date not provided. GeneTuring from 2023. No analysis of whether benchmark could be in model training data, creating unfair advantage."
    432     },
    433     {
    434       "flag": "Baseline instability",
    435       "detail": "GeneGPT reproduction (Table 1) shows extreme variation: +416.67% on one task, -83.33% on another. Raises questions about baseline reliability and experimental conditions."
    436     },
    437     {
    438       "flag": "No error analysis for GenomAgent",
    439       "detail": "While GeneGPT errors are systematically categorized (E1/E2/E3), no failure analysis is provided for GenomAgent itself. Why it scores 0.85 on alignment (vs 0.98 on nomenclature) is not explained."
    440     }
    441   ],
    442   "cited_papers": [
    443     {
    444       "title": "GeneGPT: Augmenting large language models with domain tools for improved access to biomedical information",
    445       "relevance": "Primary baseline system replicated and compared in this work"
    446     },
    447     {
    448       "title": "Why do multi-agent LLM systems fail?",
    449       "relevance": "Directly informs multi-agent architecture design and anticipated failure modes"
    450     },
    451     {
    452       "title": "Language models are few-shot learners",
    453       "relevance": "In-context learning approach foundational to both GeneGPT and GenomAgent"
    454     },
    455     {
    456       "title": "ReAct: Synergizing reasoning and acting in language models",
    457       "relevance": "ReAct framework implementation compared in GeneGPT lang configuration"
    458     },
    459     {
    460       "title": "The landscape of emerging AI agent architectures for reasoning, planning, and tool calling: A survey",
    461       "relevance": "Establishes design space for multi-agent coordination approaches"
    462     },
    463     {
    464       "title": "LLM with tools: A survey",
    465       "relevance": "Tool-augmented LLM approaches relevant to both systems"
    466     },
    467     {
    468       "title": "Found in the middle: Calibrating positional attention bias improves long context utilization",
    469       "relevance": "Addresses 'attention dilution' and context window limitations identified as GeneGPT bottleneck"
    470     },
    471     {
    472       "title": "GeneTuring tests gpt models in genomics",
    473       "relevance": "Describes GeneTuring benchmark used for all evaluations"
    474     }
    475   ],
    476   "engagement_factors": {
    477     "practical_relevance": {
    478       "score": 2,
    479       "justification": "79% cost reduction is practically significant, but lacks code/prompts for adoption. Domain-specific to genomics QA; limited broader applicability without reproduction materials."
    480     },
    481     "surprise_contrarian": {
    482       "score": 1,
    483       "justification": "Multi-agent systems outperforming single-agent systems on complex tasks is expected, not surprising. Genomics QA is a narrow domain with limited novelty."
    484     },
    485     "fear_safety": {
    486       "score": 0,
    487       "justification": "No AI safety concerns raised or discussed. Standard tool-use system for question answering on curated benchmarks poses no risk."
    488     },
    489     "demo_ability": {
    490       "score": 1,
    491       "justification": "Project website exists but code repository not confirmed available. Difficult for others to reproduce or build on without access to code/models/prompts."
    492     },
    493     "brand_recognition": {
    494       "score": 1,
    495       "justification": "University of Padua and Aalto University are respected institutions but not top-tier AI labs. EU Horizon Europe funding credible but not high-profile in AI community."
    496     },
    497     "drama_conflict": {
    498       "score": 0,
    499       "justification": "No drama or controversy. Straightforward technical improvement on narrow genomics benchmark. No adversarial framing or contested claims."
    500     }
    501   },
    502   "hn_data": {
    503     "threads": [
    504       {
    505         "hn_id": "47150074",
    506         "title": "Large-Scale Study of GitHub Pull Requests: How AI Coding Agents Modify Code",
    507         "points": 2,
    508         "comments": 0,
    509         "url": "https://news.ycombinator.com/item?id=47150074",
    510         "created_at": "2026-02-25T11:15:17Z"
    511       }
    512     ],
    513     "top_points": 2,
    514     "total_points": 2,
    515     "total_comments": 0
    516   }
    517 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs