ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (26975B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Many AI Analysts, One Dataset: Navigating the Agentic Data Science Multiverse",
      6     "authors": [
      7       "Martin Bertran",
      8       "Riccardo Fogliato",
      9       "Zhiwei Steven Wu"
     10     ],
     11     "year": 2026,
     12     "venue": "arXiv",
     13     "arxiv_id": "2602.18710",
     14     "doi": null
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Abstract claims about wide dispersion in effect sizes/p-values, steerable outcomes via persona, and structured analytic choices are all substantiated by the experimental results across three datasets and four models.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper makes causal claims about persona driving conclusions; the experimental design (randomized persona assignment across ~30 runs per cell) supports causal inference for this manipulation.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The conclusion broadly frames findings as implications for 'an AI-automated future of empirical science,' but evidence comes from only three specific social-science datasets and four LLMs, with no explicit bounding of what remains untested.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper does not address whether observed persona-driven dispersion could be primarily attributable to stochastic sampling at T=1.0 rather than persona content itself; temperature is noted as intentional but not treated as an alternative explanation for the persona effect.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper clearly distinguishes between what was measured (binary hypothesis-support decisions and effect size distributions from AI analysts) and the broader phenomenon of interest (analytical variability in empirical science).",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "There is no dedicated limitations or threats-to-validity section; constraints are embedded in the combined Conclusion and Discussion section.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The paper specifically discusses LLM training data contamination as a validity concern (soccer dataset), auditor reliability issues, and the problem that any definition of 'reasonable' analysis depends on chosen standards.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The paper does not explicitly state what its results do NOT show; boundaries around generalizability to other domains, data types, or AI systems beyond the three tested datasets are not systematically articulated.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding source is disclosed; the acknowledgment section thanks colleagues by name but contains no mention of grants or institutional funding.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are clearly listed in footnotes: Amazon AWS and Carnegie Mellon University.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funder is disclosed, so independence cannot be assessed.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "There is no competing interests or financial disclosure statement anywhere in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms are defined: 'AI analyst' is specified as a tool-using ReAct agent, 'persona' is defined with exact prompts in Appendix A, and 'many-analyst study' is explained in relation to prior literature.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper explicitly states its contribution: demonstrating that AI analysts can replicate many-analyst study diversity cheaply, and proposing a framework for treating analysis outputs as distributions to address selective reporting.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 extensively engages with garden-of-forking-paths literature (Gelman & Loken), prior many-analyst studies (Silberzahn, Breznau), multiverse analysis (Steegen, Simonsohn), and agentic data science benchmarks (BLADE, DS-1000).",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "No repository or code release is mentioned; the paper references the Inspect AI framework as the scaffold but does not release the specific research code implementing the experimental pipeline.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "The 4,946 AI analyst runs and their structured outputs are not mentioned as publicly released; while the underlying source datasets (ANES, soccer) are publicly available, the primary research data (generated analyses) are not.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "The paper mentions 'standard data-science libraries' and the Inspect AI framework but provides no requirements file, version specifications, or container definition.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step instructions for reproducing the experimental setup are provided; the appendix provides prompts but not how to instantiate the agent scaffold or run the pipeline.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "95% confidence intervals are shown for each individual analyst run in the specification curves (Figures 1, 4, 5), constituting a primary display of results.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No formal statistical tests are used to compare hypothesis support rates across persona or model conditions; all group differences are reported as descriptive percentage-point gaps.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Effect sizes are reported both for analyst-level estimates (standardized coefficients, risk differences) and for persona comparisons (34-66 percentage-point differences in support rates).",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The paper states 'We target approximately 30 independent compliant runs per cell' but provides no power analysis or justification for why 30 is sufficient to detect the anticipated effect sizes.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Specification curves display the full distribution of effect size estimates across runs, and p-value distribution plots (Figure 3) show spread across conditions.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "The Standard (neutral) persona serves as a baseline condition against which Negative, Positive, CS, and Strong CS personas are compared.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "All four LLMs tested (Claude Sonnet 4.5, Claude Haiku 4.5, Qwen3 Coder 480B, Qwen3 235B A22B) are state-of-the-art models available in early 2026.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "The study systematically varies one factor at a time (persona held constant while varying model; model held constant while varying persona), enabling attribution of dispersion to each factor.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Multiple outcome metrics are used: binary hypothesis support rate, p-value distribution, standardized effect size, exclusion/failure rate, and qualitative coding of analytic decisions.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "Two authors manually reviewed a stratified subset of runs and found no major disagreements with the automated auditor's verdicts.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": false,
    209           "answer": false,
    210           "justification": "The study is not a prediction task requiring a held-out test set; it examines the distribution of analytical choices across experimental conditions.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Results are broken down by dataset (soccer, metr-rct, anes-views), persona condition, and model in Figure 2 and Table 2.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "The paper discusses hallucination failures (confident reports without reading data), soccer dataset contamination (analysts recalling results from training), and elevated exclusion rates under confirmation-seeking personas.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "The paper explicitly reports that auditor-based filtering does not eliminate dispersion — a negative result for quality control as a solution to the analytical variability problem.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Specific model versions are stated: Claude Sonnet 4.5, Claude Haiku 4.5, Qwen3 Coder 480B, Qwen3 235B A22B, with references to Anthropic system cards.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Appendix A provides the full verbatim prompt for each of the five analyst personas and the AI auditor, including exact deltas between variants.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Sampling temperature T=1.0 is reported and its deliberate choice explained; turn limit (250 messages) and time limit (60 minutes per run) are also specified.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Section 3.2 describes the ReAct agent scaffold: tool-using agents in the Inspect AI framework with a persistent Python session, stateful shell, and file editor.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "The paper documents how structured analytic decisions were extracted (iterative LLM-based codebooks developed independently by two authors and then merged), and describes the auditor's transcript-based evaluation pipeline.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "The 4,946 AI analyst run transcripts, structured reports, and extracted analytic decisions are not mentioned as being publicly released.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section 3 describes the data collection procedure in detail: analyst agent setup, per-run protocol, compliance criteria, and targeting ~30 compliant runs per (dataset × model × persona) cell.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants were recruited; AI analysts were instantiated programmatically.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The full pipeline is documented: task delivery to analyst agent → agent execution → auditor validation → structured decision extraction → statistical analysis.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Despite explicitly discussing data contamination as a key methodological concern, the paper does not state the training data cutoff dates for any of the four tested models.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": true,
    300           "justification": "The paper explicitly discusses data contamination as a key variable across the three datasets, categorizing soccer as high-contamination and metr-rct as low-contamination, with pilot run evidence of contamination effects.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": true,
    306           "justification": "The paper directly addresses soccer dataset contamination (analysts reproducing results from training data before inspecting the data), and deliberately selected metr-rct as a recent paper to minimize this confound.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants; pre-registration not applicable.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants; IRB approval not applicable.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants; demographics not applicable.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants; inclusion/exclusion criteria not applicable.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants; randomization of human subjects not applicable.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants; blinding not applicable.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants; attrition not applicable.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "No dollar cost or token cost for running 4,946 analyst sessions (each capped at 250 messages or 60 minutes) is reported.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "No total compute budget, API cost estimate, or GPU hours are stated anywhere in the paper.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "AI analysts display wide dispersion in effect sizes, p-values, and binary hypothesis-support decisions across all three tested datasets.",
    373       "evidence": "Specification curves (Figures 1, 4, 5) show estimates spanning negative to positive across valid runs; support rates vary substantially within persona conditions.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Analyst persona framing systematically shifts hypothesis support rates by 34 to 66 percentage points across datasets.",
    378       "evidence": "Figure 2 shows support rates stratified by persona, model, and dataset; comparing Negative to Strong CS yields stated differences for all three datasets.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Dispersion persists after AI auditor filtering, indicating the auditor cannot eliminate methodologically defensible but divergent analyses.",
    383       "evidence": "Figure 3 (bottom panel) shows persistent p-value distribution separation between personas even after compliance filtering.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Specific analytic choices (outlier removal, survey weighting, standard error clustering) mediate the persona effect on conclusions.",
    388       "evidence": "Section 4.2 identifies per-dataset mechanisms: outlier removal in metr-rct, weighting in anes-views, clustering in soccer; supported by strike plots in Appendix C.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "The soccer dataset constitutes a high-contamination benchmark where AI analysts recall conclusions from training before inspecting data.",
    393       "evidence": "Directly observed in pilot runs: 'we frequently observed AI analysts reproducing qualitative conclusions before inspecting the data' (Section 3.1).",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Treating analysis outputs as distributions rather than point estimates provides a principled response to AI-generated analytical variability.",
    398       "evidence": "This is a normative framework proposal in the conclusion; the paper demonstrates the feasibility of generating distributions but does not evaluate whether this approach improves downstream inference quality.",
    399       "supported": "weak"
    400     }
    401   ],
    402   "methodology_tags": [
    403     "benchmark-eval",
    404     "observational",
    405     "case-study"
    406   ],
    407   "key_findings": "AI analysts (LLM-based ReAct agents) running autonomously on identical datasets and hypotheses exhibit wide dispersion in effect sizes, p-values, and binary hypothesis-support decisions, mirroring the signature of human many-analyst studies. This dispersion is steerable: varying the analyst persona shifts support rates by 34-66 percentage points across three datasets, with specific analytic choices (outlier removal, survey weighting, standard error clustering) as the mediating mechanisms. AI auditor filtering reduces but does not eliminate the dispersion. The findings highlight a structural vulnerability to selective reporting as AI lowers the cost of generating many defensible analyses from a single dataset.",
    408   "red_flags": [
    409     {
    410       "flag": "No formal tests on group differences",
    411       "detail": "Persona-level differences in hypothesis support rates are described only as point estimates; no confidence intervals or significance tests are reported for the main comparative claims, making it impossible to assess whether observed differences exceed chance."
    412     },
    413     {
    414       "flag": "High-contamination benchmark not isolated",
    415       "detail": "The soccer dataset is acknowledged as highly contaminated (LLMs likely recall conclusions from training), yet results are reported alongside low-contamination datasets without separating contamination-affected from clean analyses."
    416     },
    417     {
    418       "flag": "Auditor shares model family with two analysts",
    419       "detail": "The AI auditor uses Claude Sonnet 4.5, which is itself one of the analyst models and belongs to the same family as a second (Claude Haiku 4.5), creating potential systematic bias in how Claude vs. Qwen analysts are judged for compliance."
    420     },
    421     {
    422       "flag": "No code or data release",
    423       "detail": "Neither the experimental framework code nor the 4,946 generated analyst runs are released, making independent reproduction impossible despite reproducibility being a central theme of the paper."
    424     },
    425     {
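    426       "flag": "Persona effect not isolated from sampling and prompt-form variation",
    427       "detail": "All analysts run at T=1.0, so temperature is held constant rather than varied with persona; however, the paper does not demonstrate that persona content (rather than stochastic sampling at that temperature or distributional shifts from different prompt lengths/tokens) drives observed outcome differences."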
    428     },
    429     {
    430       "flag": "Sample size unjustified",
    431       "detail": "The ~30 runs per cell target is stated without a power analysis; it is unclear whether this is sufficient to reliably detect persona effects of the stated magnitude."
    432     }
    433   ],
    434   "cited_papers": [
    435     {
    436       "title": "Many analysts, one data set: Making transparent how variations in analytic choices affect results",
    437       "relevance": "Silberzahn et al. 2018 — foundational many-analyst study on soccer referees that this paper directly replicates using AI analysts; provides baseline for human analytical variability."
    438     },
    439     {
    440       "title": "Observing many researchers using the same data and hypothesis reveals a hidden universe of uncertainty",
    441       "relevance": "Breznau et al. 2022 — 73-team many-analyst study on immigration policy; central reference for the magnitude of human analytical variability this paper aims to replicate cheaply."
    442     },
    443     {
    444       "title": "BLADE: Benchmarking Language Model Agents for Data-Driven Science",
    445       "relevance": "Gu et al. 2024 — closely related multiverse-inspired agent evaluation; the paper explicitly positions its contribution relative to BLADE."
    446     },
    447     {
    448       "title": "Specification curve analysis",
    449       "relevance": "Simonsohn et al. 2020 — multiverse/specification curve methodology underpinning the paper's main visualization and analytical approach."
    450     },
    451     {
    452       "title": "Increasing transparency through a multiverse analysis",
    453       "relevance": "Steegen et al. 2016 — introduced multiverse analysis as a tool for making analytic multiplicity visible, directly motivating this work's framework."
    454     },
    455     {
    456       "title": "Measuring the impact of early-2025 AI on experienced open-source developer productivity",
    457       "relevance": "Becker et al. 2025 (METR RCT) — one of the three datasets used in the experiments; a recent low-contamination randomized controlled trial on AI programming assistance."
    458     },
    459     {
    460       "title": "Nonstandard errors",
    461       "relevance": "Menkveld et al. 2024 — formalizes analyst-induced uncertainty as a component of statistical error, providing theoretical grounding for the paper's framework."
    462     },
    463     {
    464       "title": "The garden of forking paths: Why multiple comparisons can be a problem",
    465       "relevance": "Gelman & Loken 2013 — coined the 'garden of forking paths' concept that the paper operationalizes through AI-generated multiverse analyses."
    466     },
    467     {
    468       "title": "ReAct: Synergizing reasoning and acting in language models",
    469       "relevance": "Yao et al. 2022/2023 — the ReAct agent architecture used to implement all AI analysts in this study."
    470     },
    471     {
    472       "title": "Variability in the analysis of a single neuroimaging dataset by many teams",
    473       "relevance": "Botvinik-Nezer et al. 2020 — major many-analyst study in neuroimaging, cited as evidence that analytical variability extends beyond social science into pipeline-heavy domains."
    474     }
    475   ],
    476   "engagement_factors": {
    477     "practical_relevance": {
    478       "score": 3,
    479       "justification": "Directly relevant to any researcher, data scientist, or organization using AI for analysis — demonstrates that AI analysts can be steered toward desired conclusions with simple prompt changes."
    480     },
    481     "surprise_contrarian": {
    482       "score": 3,
    483       "justification": "Challenges the implicit assumption that AI analysts are more objective than humans, showing they can be manipulated to reverse conclusions on the same data via persona framing."
    484     },
    485     "fear_safety": {
    486       "score": 2,
    487       "justification": "Raises concrete concerns about AI enabling cheap selective reporting and p-hacking at scale, particularly when empirical findings inform policy or regulation."
    488     },
    489     "drama_conflict": {
    490       "score": 2,
    491       "justification": "The strong confirmation-seeking persona explicitly encouraging p-hacking and the demonstration of 66pp swings in support rates creates a provocative narrative about AI-enabled scientific misconduct."
    492     },
    493     "demo_ability": {
    494       "score": 1,
    495       "justification": "Uses publicly available models and datasets, but the specific framework code is not released, limiting immediate reproducibility by others."
    496     },
    497     "brand_recognition": {
    498       "score": 2,
    499       "justification": "Uses Claude (Anthropic) and Qwen (Alibaba) models; authors are from Amazon AWS and CMU, adding institutional visibility."
    500     }
    501   },
    502   "hn_data": {
    503     "threads": [],
    504     "top_points": 0,
    505     "total_points": 0,
    506     "total_comments": 0
    507   }
    508 }

Impressum · Datenschutz