scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26330B)
      1 {
      2   "paper": {
      3     "title": "Many AI Analysts, One Dataset: Navigating the Agentic Data Science Multiverse",
      4     "authors": ["Martin Bertran", "Riccardo Fogliato", "Zhiwei Steven Wu"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.18710"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "Fully autonomous AI analysts built on LLMs reproduce the defining signature of many-analyst studies: wide dispersion in effect sizes, p-values, and binary hypothesis support across independent runs on the same dataset. This dispersion is steerable—varying analyst persona or base model shifts outcome distributions by 34–66 percentage points even after auditor-based filtering. Across 4,946 runs (3,303 compliant) on three datasets, confirmation-seeking personas produce systematically lower p-values and higher support rates, with recognizable analytic choices (weighting, outlier removal, clustering) driving the differences.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL, code archive, or link to released code is provided anywhere in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The underlying datasets (soccer, METR RCT, ANES) are public, but the paper does not release its own generated transcripts, analyst outputs, or extracted decision codebooks."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper mentions the Inspect AI framework and specific LLMs but provides no requirements.txt, Dockerfile, or versioned dependency list."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided. The experimental setup is described but not in a runnable form."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "95% confidence intervals are shown in the specification curves (Figures 1, 4, 5) for each AI analyst-produced analysis."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper reports percentage point differences in support rates across personas (34–66 pp) but does not apply any formal statistical test to its own comparisons between conditions. Claims like 'persona choice alone produces up to a 47 percentage point swing' are based on raw comparisons."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Support rate differences are reported with context: 'comparing the most skeptical (Negative) to the most confirmation-seeking (Strong CS) persona yields support rate differences ranging from 34 percentage points (anes-views) to 66 percentage points (metr-rct)' (Section 4.2)."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper targets ~30 runs per cell but provides no power analysis or justification for why 30 is sufficient. Total N=4,946 runs (3,303 compliant) is stated but not justified."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Distributions of p-values (Figure 3), effect sizes (Figures 1, 4, 5), and support rates (Figure 2) are shown, making variance visible across runs."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The Standard (neutral) persona serves as the baseline condition against which Negative, Positive, CS, and Strong CS personas are compared."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Contemporary models used: Claude Sonnet 4.5, Haiku 4.5, Qwen3 Coder 480B, Qwen3 235B A22B — all recent at time of writing."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Persona and model are systematically varied as independent variables, functioning as a factorial experiment that isolates the contribution of each factor to outcome variation."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Multiple outcome measures reported: hypothesis support rate, p-value distributions, effect size distributions, exclusion rates (Table 2), and extracted analytic decision patterns."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "'Two authors manually reviewed a stratified subset of runs and found no major disagreements that would change substantive conclusions with respect to the default full-transcript auditor' (Section 3.3)."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "The study is not a train/test ML evaluation — it studies analytical variability across agent runs. No held-out set concept applies."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down by dataset (3 datasets), persona (5 conditions), and model (4 models) in Table 2 and Figure 2."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Exclusion patterns are discussed in detail: hallucinated results in pilot runs, truncated reports (38% for Qwen), variable misidentification (42% for Qwen), and confirmation-seeking personas showing elevated exclusion rates (Section 4.1)."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports that auditing 'does not eliminate the dispersion' and that Qwen3 Coder 480B had a 48% exclusion rate overall and 82–84% for CS personas (Table 2)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims about wide dispersion, structured analytic diversity, and steerability are all supported by Figures 1–5, Table 2, and the specification curve analyses in Sections 4.1–4.2."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The paper claims persona and model choice 'shift' outcome distributions. Since persona is a manipulated independent variable in a controlled experiment (holding dataset and estimand fixed), causal language is appropriate."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The abstract states 'These results highlight a central challenge for an AI-automated future of empirical science' — generalizing from 4 LLMs and 3 datasets to all of empirical science. The paper does not explicitly bound its claims to the tested models or task types."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper discusses contamination as an alternative explanation (soccer dataset results may reflect training data recall, not analysis), model-specific failure modes (Qwen truncation vs. deliberate choices), and notes that auditor filtering may itself introduce bias (Section 5)."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper directly measures what it claims: analytical variability in effect sizes, p-values, and support decisions. It also explicitly fixes estimands to ensure comparability, acknowledging this 'restricts the portion of the multiverse that analysts can explore' (Section 3.1)."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "Models listed as 'Claude Sonnet 4.5', 'Haiku 4.5', 'Qwen3 Coder 480B', 'Qwen3 235B A22B' — marketing names without snapshot dates or API version identifiers."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Full prompt text for all five analyst personas and the auditor system/user prompts are provided in Appendix A, including the exact modifications for each variant."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "'All AI analysts use sampling temperature T = 1.0, chosen deliberately so that stochastic sampling contributes to analytic diversity, and are capped at 250 messages or 60 minutes per run' (Section 3.2)."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "'Analysts are implemented as tool-using ReAct agents in the Inspect AI framework. Each analyst has access to a persistent Python session with standard data-science libraries, a stateful shell, and a file editor' (Section 3.2)."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The auditing pipeline is documented: compliance screening criteria (validity verdict + required outputs), exclusion rates by model and persona (Table 2), and the two-criterion filter (Section 4.1)."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations section. Some limitations are discussed within the Conclusion and Discussion (Section 5), but they are embedded in the broader discussion rather than receiving substantive standalone treatment."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Specific threats discussed: soccer dataset contamination ('AI analysts reproducing qualitative conclusions before inspecting the data'), auditor limitations ('any definition of a reasonable analysis depends on chosen analytic standards'), and Qwen-specific failure modes (Section 4.1)."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what it did NOT test or claim. It generalizes to 'an AI-automated future of empirical science' without bounding to the tested models, domains, or task types."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No release of raw transcripts, analyst outputs, auditor evaluations, or extracted decision codebooks. Only aggregated results are shown."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 3 describes the full experimental procedure: how AI analysts are configured, what inputs they receive, how runs are audited, and how decisions are extracted."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. The 'subjects' are AI agent runs, whose generation procedure is described in Section 3.2."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Pipeline documented: 4,946 total runs → compliance screening → 3,303 compliant (67%). Exclusion rates broken down by model and persona in Table 2. Decision extraction codebook process described in Section 3.3."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding disclosure or acknowledgments section listing grants or corporate sponsorship. The authors are at Amazon AWS, but the paper contains no explicit funding statement."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations clearly listed: Amazon AWS (Bertran, Wu), 'Work done while at Amazon AWS' (Fogliato), and Carnegie Mellon University (Wu)."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Authors are Amazon AWS employees. Amazon has a commercial interest in AI systems and their perceived reliability. The paper's findings about analytical variability and steerability could affect perceptions of AI-powered analytics products."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial disclosure statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper discusses contamination qualitatively (soccer = 'high-contamination', metr-rct = 'unlikely to appear in current training corpora') but does not state specific training data cutoff dates for any of the four models used."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "'In pilot runs we frequently observed AI analysts reproducing qualitative conclusions before inspecting the data' (Section 3.3). The paper explicitly uses contamination level as a design variable across the three datasets."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "Contamination is a core design variable: the soccer dataset is 'high-contamination', metr-rct is 'recent and unlikely to appear in current training corpora', and anes-views is 'designed to be low-contamination' (Section 3.1). The auditor filters runs that recall results without analysis."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants. The study uses AI agents as the subjects."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No API costs, token counts, or per-run costs reported despite running 4,946 agent sessions across four commercial/large LLMs."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No total compute budget stated. The 250-message/60-minute cap per run is mentioned but total API spend or GPU hours are not reported."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Temperature T=1.0 with ~30 independent runs per cell explicitly produces stochastic variation. The entire paper is about characterizing this variation — dispersion across runs IS the result."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "'We target approximately 30 independent compliant runs per (dataset × model × persona) cell' (Section 3.2). Total: 4,946 runs, 3,303 compliant."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No hyperparameter search was conducted for the agent framework itself. T=1.0 and 250-message cap are stated as design choices without exploring alternatives."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": false,
    309         "answer": false,
    310         "justification": "The study does not select a best configuration — it characterizes the distribution across all configurations."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The paper makes comparisons across 5 personas × 4 models × 3 datasets but applies no formal statistical tests to its own comparisons, and consequently no multiple comparison correction."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "The paper does not claim its system is better than alternatives — it studies a phenomenon (analytical variability). Self-comparison bias is not relevant."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "The study does not compare methods at different compute levels. All conditions use the same per-run budget."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "The paper compares AI analyst variability against human many-analyst studies (Silberzahn et al., Breznau et al.) to validate that the framework captures real analytical variability. The soccer dataset replication explicitly tests construct validity."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "All models use the same Inspect AI scaffold with identical tool access. Model comparisons are within the same scaffold, isolating the model variable."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "Explicitly addressed: soccer dataset is 'high-contamination' because results are widely known, metr-rct is recent and unlikely in training corpora, anes-views is 'designed to be low-contamination' (Section 3.1)."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the task prompt or dataset structure leaks information about expected results beyond the explicit contamination discussion."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether runs within the same model-persona cell are truly independent given shared model weights and prompt structure."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "The auditor detects contamination behaviorally: 'some analysts could fully recall the results from Silberzahn et al. and reported those' — these runs are flagged and filtered (Section 3.3)."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "AI analysts reproduce the defining signature of many-analyst studies: systematic dispersion in defensible conclusions under a shared hypothesis, dataset, and estimand.",
    364       "evidence": "Across 3,303 compliant runs on 3 datasets, AI analysts show wide dispersion in effect sizes, p-values, and binary support decisions (Figures 1–5, Section 4).",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "The outcome distribution is steerable: varying persona or base model shifts the distribution of evidential outcomes even after auditor-based filtering.",
    369       "evidence": "Comparing Negative to Strong CS persona yields support rate differences of 34 pp (anes-views) to 66 pp (metr-rct). Figure 2 shows systematic shifts by persona and model (Section 4.2).",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Confirmation-seeking personas produce systematically lower p-values.",
    374       "evidence": "Figure 3 shows p-value distributions shifted downward for CS and Strong CS personas across datasets, both before and after compliance filtering.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Auditing filters invalid runs but does not eliminate dispersion.",
    379       "evidence": "33% of runs excluded (1,643 of 4,946; Table 2), but compliant runs still show wide dispersion in Figures 1–5. The separation in p-value distributions persists after filtering (Figure 3 bottom panel).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "These results highlight a central challenge for an AI-automated future of empirical science.",
    384       "evidence": "Based on 4 LLMs and 3 datasets. The generalization to 'empirical science' broadly is not empirically supported beyond these settings.",
    385       "supported": "moderate"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "Company conflict not disclosed",
    391       "detail": "All authors are affiliated with Amazon AWS. Amazon sells AI services and has a commercial interest in how AI analytical tools are perceived. No competing interests statement is provided."
    392     },
    393     {
    394       "flag": "No formal statistical tests on own claims",
    395       "detail": "The paper reports large percentage-point differences between conditions (34–66 pp) only as raw differences: it applies no significance tests or confidence intervals to its own between-condition comparisons. The statistical rigor applied to analyzing AI analyst outputs is not applied to the paper's own experimental comparisons."
    396     },
    397     {
    398       "flag": "No data or code release",
    399       "detail": "Despite running 4,946 agent sessions generating reproducible transcripts, neither the raw transcripts, extracted decisions, auditor evaluations, nor the analysis code are released. This limits independent verification of the reported results."
    400     },
    401     {
    402       "flag": "Unbounded generalization",
    403       "detail": "The paper generalizes from 4 LLMs on 3 datasets to claims about 'an AI-automated future of empirical science' without bounding the scope of its conclusions."
    404     }
    405   ],
    406   "cited_papers": [
    407     {
    408       "title": "Measuring the impact of early-2025 AI on experienced open-source developer productivity",
    409       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    410       "year": 2025,
    411       "arxiv_id": "2507.09089",
    412       "relevance": "The METR coding RCT, one of the three datasets used in this study; directly relevant to AI programming productivity evaluation."
    413     },
    414     {
    415       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    416       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    417       "year": 2023,
    418       "arxiv_id": "2310.06770",
    419       "relevance": "Major LLM code generation benchmark referenced in the related work on agentic evaluation."
    420     },
    421     {
    422       "title": "Data interpreter: An LLM agent for data science",
    423       "authors": ["Sirui Hong"],
    424       "year": 2024,
    425       "arxiv_id": "2402.18679",
    426       "relevance": "LLM agent for automated data science, directly relevant to agentic AI coding and analysis workflows."
    427     },
    428     {
    429       "title": "ReAct: Synergizing reasoning and acting in language models",
    430       "authors": ["Shunyu Yao"],
    431       "year": 2022,
    432       "relevance": "The ReAct agent framework used as the computational scaffold for the AI analysts in this study."
    433     },
    434     {
    435       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    436       "authors": ["John Yang", "Carlos E Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    437       "year": 2024,
    438       "relevance": "Agentic software engineering agent referenced in the related work on LLM-based coding tools."
    439     },
    440     {
    441       "title": "BLADE: Benchmarking language model agents for data-driven science",
    442       "authors": ["Ken Gu"],
    443       "year": 2024,
    444       "arxiv_id": "2408.09667",
    445       "relevance": "Most closely related work — benchmarks LLM agents on data science tasks; this paper explicitly positions against it."
    446     },
    447     {
    448       "title": "Toolformer: Language models can teach themselves to use tools",
    449       "authors": ["Timo Schick"],
    450       "year": 2023,
    451       "relevance": "Foundational work on tool-using LLMs, underlying the agentic approach used in this study."
    452     },
    453     {
    454       "title": "Measuring AI ability to complete long tasks",
    455       "authors": ["Thomas Kwa"],
    456       "year": 2025,
    457       "arxiv_id": "2503.14499",
    458       "relevance": "Benchmark for long-horizon AI task completion, relevant to evaluating agentic AI capabilities."
    459     },
    460     {
    461       "title": "The AI Scientist-v2: Workshop-level automated scientific discovery via agentic tree search",
    462       "authors": ["Yutaro Yamada"],
    463       "year": 2025,
    464       "arxiv_id": "2504.08066",
    465       "relevance": "Automated scientific discovery agent relevant to AI-driven research and the risks of automated analysis."
    466     },
    467     {
    468       "title": "DS-Agent: Automated data science by empowering large language models with case-based reasoning",
    469       "authors": ["Siyuan Wang"],
    470       "year": 2024,
    471       "arxiv_id": "2402.17753",
    472       "relevance": "LLM-powered data science agent directly relevant to the agentic data analysis paradigm studied."
    473     },
    474     {
    475       "title": "Discovering language model behaviors with model-written evaluations",
    476       "authors": ["Ethan Perez"],
    477       "year": 2023,
    478       "relevance": "Documents sycophancy and other LLM behavioral issues relevant to understanding AI analyst bias."
    479     }
    480   ]
    481 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs