scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (29926B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "From Single to Multi-Agent Reasoning: Advancing GeneGPT for Genomics QA",
      6     "authors": [
      7       "Kimia Abedini",
      8       "Farzad Shami",
      9       "Gianmaria Silvello"
     10     ],
     11     "year": 2026,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2601.10581",
     14     "doi": "10.48550/arXiv.2601.10581"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The abstract claims 12% average improvement and these numbers match Table 2 results (0.93 vs 0.83). The 79% cost reduction claim is also supported ($2.11 vs $10.06).",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper claims GenomAgent's multi-agent architecture 'addresses efficiency bottlenecks' and 'delivers superior accuracy,' but the authors themselves acknowledge the improvement 'cannot be cleanly attributed to specific architectural choices without systematic ablation analysis.' Multiple variables change simultaneously: model (GPT-4o-mini vs deprecated models), architecture (multi-agent vs single-agent), APIs (multi-source vs single-source), and evaluation protocol adjustments.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The abstract claims the 'flexible architecture extends beyond genomics to various scientific domains needing expert knowledge extraction' but testing is only on GeneTuring. The title is appropriately scoped to 'Genomics QA', but the abstract's broader cross-domain generalization claim is unsupported. The paper acknowledges in limitations that evaluation is 'limited to the GeneTuring benchmark.'",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The improvement could be due to the model upgrade (GPT-4o-mini vs deprecated Codex/GPT-3.5), multi-source database access, or evaluation protocol changes (expanded vocabulary, partial scoring), rather than the multi-agent architecture itself. These confounds are not discussed.",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper measures specific task-level metrics (exact match, recall, partial scoring) and reports them as task-level performance. The macro-average is explicitly acknowledged as 'a simplified view of the system's diverse capabilities across heterogeneous genomics QA tasks' (Section 2).",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 6 (Final Remarks and Future Work) contains substantive discussion of limitations including lack of ablation, benchmark scope limitation, and need for hybrid approaches.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The paper states specific limitations: '12% average improvement cannot be cleanly attributed to specific architectural choices without systematic ablation analysis' and 'evaluation is limited to the GeneTuring benchmark. This restricted scope prevents us from fully validating GenomAgent's generalizability.'",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper explicitly states: evaluation limited to GeneTuring benchmark, no ablation analysis performed, no comparison with emerging frameworks like Chen et al. (2025). These are specific things not tested.",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Acknowledgments section states: 'partially supported by the HEREDITARY Project, as part of the European Union's Horizon Europe research and innovation programme under grant agreement No GA 101137074.'",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are clearly listed: University of Padua (Italy) and Aalto University (Finland). No commercial affiliations.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The EU Horizon Europe HEREDITARY project is a research grant with no commercial stake in GenomAgent's performance vs GeneGPT.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Disclosure of Interests section states: 'The authors have no competing interests to declare that are relevant to the content of this article.'",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "Key terms used without definition: 'multi-agent framework,' 'specialized agents,' 'domain-specific' appear throughout but lack precise definitions. What constitutes an 'agent' in their framework is never formally defined.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Contribution is explicit: GenomAgent multi-agent framework with two quantified results (12% performance improvement, 79% cost reduction) on GeneTuring benchmark.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Paper engages substantively: detailed review of GeneGPT (Section 2), cites multi-agent system advances, discusses limitations of single-agent approach relative to existing architecture paradigms.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "A project page URL is provided in the abstract: https://kimia-abedini.github.io/Genom-Agent/. This is a public URL for the project.",
    123           "source": "opus"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The evaluation uses the publicly available GeneTuring benchmark (Hou & Ji, 2023), comprising 12 tasks with 50 QA pairs each. The benchmark is a standard public dataset.",
    129           "source": "opus"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No environment specifications, requirements.txt, Dockerfile, or dependency versions are mentioned in the paper.",
    135           "source": "opus"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions are provided in the paper. The project page is referenced but the paper itself contains no reproduction guide.",
    141           "source": "opus"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "All results in Tables 1 and 2 are point estimates with no confidence intervals or error bars.",
    149           "source": "opus"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "The paper claims GenomAgent 'outperforms' GeneGPT based solely on comparing point scores (0.93 vs 0.83) with no statistical significance tests.",
    155           "source": "opus"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Percentage improvements are reported with baseline context throughout: '12% increase in average performance (0.93 vs. 0.83)', '79% reduction in computational cost ($2.11 vs. $10.06)', and per-category improvements in Table 2.",
    161           "source": "opus"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "Each task has 50 QA pairs from the GeneTuring benchmark, but no justification is given for why this sample size is adequate for the claims made.",
    167           "source": "opus"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No variance, standard deviation, or multiple-run statistics are reported. Results appear to be from single runs.",
    173           "source": "opus"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Multiple GeneGPT configurations (Full, Slim, Turbo, Lang) serve as baselines, and the original paper's baselines (Bing Chat, BioMedLM, GPT-3) are also referenced.",
    181           "source": "opus"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": false,
    186           "justification": "The main baseline GeneGPT was published in 2024 and uses deprecated models (code-davinci-002, GPT-3.5-turbo-16k). The paper acknowledges a concurrent work (Chen et al., 2025, ref [5]) but only mentions it as future comparison rather than including it.",
    187           "source": "opus"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": false,
    192           "justification": "The paper explicitly acknowledges in Future Work (Section 6): 'the 12% average improvement cannot be cleanly attributed to specific architectural choices without systematic ablation analysis.' No ablation is performed.",
    193           "source": "opus"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Multiple task-specific metrics are used: exact match accuracy for nomenclature, recall for association tasks, vocabulary-mapped matching for cross-species alignment, and partial scoring for human genome alignment.",
    199           "source": "opus"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "The authors manually reviewed and categorized all mistakes made by the reproduced GeneGPT system into three error types (E1, E2, E3) in Section 3.",
    205           "source": "opus"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "The GeneTuring benchmark is a fixed test set of 50 QA pairs per task, not used for any tuning or development.",
    211           "source": "opus"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Tables 1 and 2 provide detailed per-task breakdowns across all 9 tasks grouped into 4 categories (nomenclature, genomic location, functional analysis, sequence alignment).",
    217           "source": "opus"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Section 3 categorizes reproduced GeneGPT errors into E1 (incomplete data coverage), E2 (stop-token parsing failures), and E3 (context loss). However, GenomAgent's failures are not discussed.",
    223           "source": "opus"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "The reproducibility study shows degraded performance in turbo settings (Table 1), including an 83.33% drop on DNA-to-Human alignment. The lang configuration showed massive variability.",
    229           "source": "opus"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "The paper states 'GPT-4o-mini' as the replacement model without any version date or API snapshot identifier.",
    237           "source": "opus"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "The paper describes prompting strategies and agent roles in natural language but does not provide actual prompt text. Agent-specific instructions are described functionally but not reproduced.",
    243           "source": "opus"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any model configuration.",
    249           "source": "opus"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Section 4 and Figure 1 describe the multi-agent architecture in detail: Task Detection Agent, MCP Agent, Response Handler Agent, Feature Extractor Agent, Code Writer Agent, Code Executor Agent, and Final Decision Agent, with their roles and interactions. Built on Google Agent Development Kit.",
    255           "source": "opus"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "The paper documents changes from the original GeneGPT: removal of context truncation, explicit prompt format enforcement, expanded vocabulary mappings, and enhanced partial scoring mechanisms (Section 5).",
    261           "source": "opus"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "No raw output data (model responses, API call logs) is made available. Only aggregated scores are reported.",
    269           "source": "opus"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "The GeneTuring benchmark is well-described: 12 tasks, 50 QA pairs each, 9 tasks selected matching the original GeneGPT paper. The benchmark source (Hou & Ji, 2023) is cited.",
    275           "source": "opus"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants. Data comes from a standard public benchmark.",
    281           "source": "opus"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The processing pipeline is documented: query → Task Detection Agent → MCP Agent (parallel API calls) → Response Handler → Final Decision Agent. Evaluation protocol including metric choices is described.",
    287           "source": "opus"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "GPT-4o-mini is used but its training data cutoff is not stated. The GeneTuring benchmark (2023) could be in the training data.",
    295           "source": "opus"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No discussion of whether GPT-4o-mini's training data includes GeneTuring benchmark questions or answers.",
    301           "source": "opus"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "GeneTuring was published in 2023 (BioRxiv). GPT-4o-mini was trained after this date. No contamination analysis is performed.",
    307           "source": "opus"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants in this study.",
    315           "source": "opus"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants in this study.",
    321           "source": "opus"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants in this study.",
    327           "source": "opus"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants in this study.",
    333           "source": "opus"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants in this study.",
    339           "source": "opus"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants in this study.",
    345           "source": "opus"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants in this study.",
    351           "source": "opus"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "Table 2 reports per-category and total costs for both GenomAgent ($2.11 total) and all GeneGPT configurations ($10.06-$16.76), computed from token counts and real model pricing.",
    359           "source": "opus"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "While API costs are reported, no total compute budget, GPU hours, or wall-clock time is stated.",
    365           "source": "opus"
    366         }
    367       },
    368       "experimental_rigor": {
    369         "seed_sensitivity_reported": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be single-run.",
    373           "source": "opus"
    374         },
    375         "number_of_runs_stated": {
    376           "applies": true,
    377           "answer": false,
    378           "justification": "The number of experimental runs is not stated. It appears each configuration was run once.",
    379           "source": "opus"
    380         },
    381         "hyperparameter_search_budget": {
    382           "applies": true,
    383           "answer": false,
    384           "justification": "No hyperparameter search is described or budgeted, even though the agent configurations and prompt designs are tunable design choices.",
    385           "source": "opus"
    386         },
    387         "best_config_selection_justified": {
    388           "applies": true,
    389           "answer": false,
    390           "justification": "GenomAgent appears to be a single configuration. No description of how design choices were selected or whether alternatives were tried.",
    391           "source": "opus"
    392         },
    393         "multiple_comparison_correction": {
    394           "applies": false,
    395           "answer": false,
    396           "justification": "No statistical tests are performed, so multiple comparison correction is not applicable.",
    397           "source": "opus"
    398         },
    399         "self_comparison_bias_addressed": {
    400           "applies": true,
    401           "answer": false,
    402           "justification": "The authors implement both GenomAgent and the reproduced GeneGPT baseline. No acknowledgment of author-evaluation bias in baseline reimplementation.",
    403           "source": "opus"
    404         },
    405         "compute_budget_vs_performance": {
    406           "applies": true,
    407           "answer": true,
    408           "justification": "Table 2 and Figure 2 explicitly compare performance against cost for all systems, showing the performance-cost tradeoff.",
    409           "source": "opus"
    410         },
    411         "benchmark_construct_validity": {
    412           "applies": true,
    413           "answer": false,
    414           "justification": "The GeneTuring benchmark is used without discussion of whether its 50-question QA format adequately measures genomic QA capability or whether it represents real-world genomic research needs.",
    415           "source": "opus"
    416         },
    417         "scaffold_confound_addressed": {
    418           "applies": true,
    419           "answer": false,
    420           "justification": "GenomAgent uses a completely different architecture (multi-agent on Google ADK with GPT-4o-mini) compared to GeneGPT (single-agent with deprecated models). The improvement is attributed to the multi-agent architecture, but the model, framework, and APIs all differ simultaneously. The authors acknowledge this in limitations but do not address it experimentally.",
    421           "source": "opus"
    422         }
    423       },
    424       "data_leakage": {
    425         "temporal_leakage_addressed": {
    426           "applies": true,
    427           "answer": false,
    428           "justification": "GeneTuring (2023) predates GPT-4o-mini's training. No discussion of whether the model has seen the benchmark questions during training.",
    429           "source": "opus"
    430         },
    431         "feature_leakage_addressed": {
    432           "applies": true,
    433           "answer": false,
    434           "justification": "No discussion of whether the evaluation setup (API access to NCBI, HGNC, UCSC) provides information that constitutes feature leakage.",
    435           "source": "opus"
    436         },
    437         "non_independence_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "No discussion of independence between any in-context learning examples and test questions.",
    441           "source": "opus"
    442         },
    443         "leakage_detection_method": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No concrete leakage detection or prevention method is applied.",
    447           "source": "opus"
    448         }
    449       }
    450     }
    451   },
    452   "claims": [
    453     {
    454       "claim": "GenomAgent achieves 0.93 average score on GeneTuring, 12% improvement over GeneGPT's 0.83",
    455       "evidence": "Table 2: GenomAgent 0.93 vs GeneGPT-Slim 0.83; macro-averaged across 9 tasks",
    456       "supported": "strong"
    457     },
    458     {
    459       "claim": "GenomAgent reduces computational cost by 79%, from $10.06 to $2.11 total",
    460       "evidence": "Table 2 cost column; token tracking and OpenAI pricing applied uniformly",
    461       "supported": "strong"
    462     },
    463     {
    464       "claim": "GenomAgent's multi-agent architecture addresses three critical bottlenecks: limited data coverage, parsing failures, context loss",
    465       "evidence": "Identified from GeneGPT reproducibility (E1, E2, E3); architecture claims in Section 4 but no ablation to prove causation",
    466       "supported": "weak"
    467     },
    468     {
    469       "claim": "Sequence alignment tasks show largest gains (28.8%) due to multi-source retrieval and adaptive partial scoring",
    470       "evidence": "Table 2: DNA-to-Human 0.42→0.85 (102.4%), DNA-to-Species 0.88→0.85 (−3.4%); causation not proven without ablation",
    471       "supported": "moderate"
    472     },
    473     {
    474       "claim": "GenomAgent's flexible architecture extends beyond genomics to various scientific domains",
    475       "evidence": "Abstract claim; zero evidence provided. Evaluation is genomics-only (GeneTuring benchmark)",
    476       "supported": "unsupported"
    477     },
    478     {
    479       "claim": "GeneGPT's stop-token mechanism (→) is incompatible with modern LLMs like GPT-4o-mini",
    480       "evidence": "Reproducibility study (Section 3): 'GPT-4o-mini did not consistently follow the URL generation format'; required explicit prompting",
    481       "supported": "moderate"
    482     },
    483     {
    484       "claim": "Parallel API querying in GenomAgent reduces context window constraints vs GeneGPT's sequential processing",
    485       "evidence": "Architectural description in Section 4; no empirical comparison of context usage or response latency",
    486       "supported": "weak"
    487     }
    488   ],
    489   "methodology_tags": [
    490     "benchmark-eval",
    491     "case-study"
    492   ],
    493   "key_findings": "GenomAgent, a multi-agent framework for genomics QA, achieves 0.93 average score on GeneTuring (12% above GeneGPT's 0.83) while reducing cost by 79% ($2.11 vs $10.06). Sequence alignment tasks see the largest gains (28.8%). The framework addresses three limitations identified in GeneGPT: incomplete data coverage, stop-token parsing failures, and context loss in multi-turn queries. Despite strong empirical results, the paper lacks ablation studies to attribute improvements to specific architectural choices rather than to the model upgrade (Codex→GPT-4o-mini).",
    494   "red_flags": [
    495     {
    496       "flag": "No ablation study",
    497       "detail": "Authors explicitly acknowledge inability to attribute the 12% improvement to specific architectural choices. Improvement could partly result from using GPT-4o-mini instead of Codex/GPT-3.5-turbo."
    498     },
    499     {
    500       "flag": "Limited evaluation scope",
    501       "detail": "Single benchmark (GeneTuring), only 9 of 12 tasks. Claim about extending to 'various scientific domains' is unsupported—only genomics tested."
    502     },
    503     {
    504       "flag": "No variance or significance testing",
    505       "detail": "All results are point estimates with no error bars, confidence intervals, or statistical tests. No statement of variance across runs."
    506     },
    507     {
    508       "flag": "Unsupported generalization claims",
    509       "detail": "Abstract claims GenomAgent 'extends beyond genomics to various scientific domains' with zero evidence. This is a clear overgeneralization."
    510     },
    511     {
    512       "flag": "No failure case analysis for GenomAgent",
    513       "detail": "Paper analyzes GeneGPT errors (E1, E2, E3) but does not discuss where GenomAgent fails or what types of queries it struggles with."
    514     },
    515     {
    516       "flag": "No reproducibility",
    517       "detail": "Code not released (only website link, no explicit release statement). No environment specs (requirements.txt, Dockerfile). No prompts for agents. No hyperparameters."
    518     },
    519     {
    520       "flag": "Potential training data contamination",
    521       "detail": "GeneTuring is a published benchmark (2023). GPT-4o-mini's training cutoff is not stated—no discussion of whether GeneTuring examples are in training data."
    522     },
    523     {
    524       "flag": "Modest baseline comparison",
    525       "detail": "Compares only to GeneGPT. No comparison to other multi-agent frameworks (e.g., LangChain agents, AutoGen) or recent genomics LLM systems."
    526     }
    527   ],
    528   "cited_papers": [
    529     {
    530       "title": "Why do multi-agent LLM systems fail?",
    531       "authors": "Cemri et al.",
    532       "year": 2025,
    533       "arxiv_id": "2503.13657",
    534       "relevance": "Foundational concern about multi-agent LLM reliability; directly motivates GenomAgent's design choices."
    535     },
    536     {
    537       "title": "The landscape of emerging AI agent architectures for reasoning, planning, and tool calling: A survey",
    538       "authors": "Masterman et al.",
    539       "year": 2024,
    540       "arxiv_id": "2404.11584",
    541       "relevance": "Survey of agent architectures; GenomAgent contributes a domain-specific orchestration pattern."
    542     },
    543     {
    544       "title": "ReAct: Synergizing reasoning and acting in language models",
    545       "authors": "Yao et al.",
    546       "year": 2023,
    547       "relevance": "ReAct framework used in GeneGPT's lang configuration; GenomAgent extends multi-agent coordination beyond ReAct."
    548     },
    549     {
    550       "title": "GeneGPT: Augmenting large language models with domain tools for improved access to biomedical information",
    551       "authors": "Jin et al.",
    552       "year": 2024,
    553       "venue": "Bioinformatics",
    554       "relevance": "The baseline system being replicated and improved; core architecture for comparison."
    555     },
    556     {
    557       "title": "GeneTuring tests GPT models in genomics",
    558       "authors": "Hou & Ji",
    559       "year": 2023,
    560       "venue": "BioRxiv",
    561       "relevance": "GeneTuring benchmark; the sole evaluation dataset used in this study."
    562     },
    563     {
    564       "title": "Language models are few-shot learners",
    565       "authors": "Brown et al.",
    566       "year": 2020,
    567       "venue": "NeurIPS 33",
    568       "relevance": "In-context learning paradigm; foundation for both GeneGPT and GenomAgent's prompt engineering."
    569     }
    570   ],
    571   "engagement_factors": {
    572     "practical_relevance": {
    573       "score": 2,
    574       "justification": "79% cost reduction is significant for production genomics QA. However, evaluation limited to one benchmark, requires paid GPT-4o-mini access, and genomics is a narrow domain."
    575     },
    576     "surprise_contrarian": {
    577       "score": 1,
    578       "justification": "Multi-agent coordination for QA is well-established; GeneGPT reproducibility failure is expected given model deprecation. No novel findings about genomics or LLMs."
    579     },
    580     "fear_safety": {
    581       "score": 0,
    582       "justification": "Genomics QA is a benign application. No AI safety, alignment, security, or existential risk concerns discussed or raised."
    583     },
    584     "drama_conflict": {
    585       "score": 0,
    586       "justification": "Straightforward technical engineering paper. No controversy, debate, or competing theories. Authors professionally acknowledge limitations."
    587     },
    588     "demo_ability": {
    589       "score": 1,
    590       "justification": "Website exists but code not released. API-dependent (GPT-4o-mini). Hard for researchers to reproduce or extend without code access."
    591     },
    592     "brand_recognition": {
    593       "score": 2,
    594       "justification": "University of Padua and Aalto University are respected but not top-tier ML labs. Genomics domain carries some authority but no FAANG/DeepMind/OpenAI affiliation."
    595     }
    596   },
    597   "hn_data": {
    598     "threads": [
    599       {
    600         "hn_id": "47150074",
    601         "title": "Large-Scale Study of GitHub Pull Requests: How AI Coding Agents Modify Code",
    602         "points": 2,
    603         "comments": 0,
    604         "url": "https://news.ycombinator.com/item?id=47150074",
    605         "created_at": "2026-02-25T11:15:17Z"
    606       }
    607     ],
    608     "top_points": 2,
    609     "total_points": 2,
    610     "total_comments": 0
    611   }
    612 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs