scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23256B)
      1 {
      2   "paper": {
      3     "title": "Overseeing Agents Without Constant Oversight: Challenges and Opportunities",
      4     "authors": [
      5       "Madeleine Grunde-McLaughlin",
      6       "Hussein Mozannar",
      7       "Maya Murad",
      8       "Jingya Chen",
      9       "Saleema Amershi",
     10       "Adam Fourney"
     11     ],
     12     "year": 2026,
     13     "venue": "arXiv",
     14     "arxiv_id": "2602.16844"
     15   },
     16   "scan_version": 2,
     17   "active_modules": [],
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No repository URL or code release is mentioned. The studies used Magentic-UI but no code for the novel interface or analysis scripts is provided."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No dataset or study data is released. Tasks are described in supplementary materials within the paper, but raw participant responses and logs are not made available."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No environment specifications, dependency lists, or setup instructions are provided."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No reproduction instructions are provided. The Wizard-of-Oz study designs are described but not with enough detail to replicate the hand-created annotations and interfaces."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "95% confidence intervals are reported for all effect sizes in the controlled study (Table 3, Figure 6), calculated using the noncentral t distribution."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The authors explicitly chose not to report p-values, instead reporting effect sizes and CIs per recommendations for fair statistical communication in HCI (Section 5.2, citing Dragicevic 2016). While principled, no formal significance tests are used."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Hedges' g effect sizes are reported throughout the controlled study: accuracy (0.18), duration (-0.29), confidence (0.56), with breakdowns by condition (Table 3, Figure 6). Cutoffs of 0.2, 0.5, 0.8 for small/medium/large are stated."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "N=12 per study is not justified. The paper acknowledges 'our study does not have high power due to resource constraints' (Section 5.2) but does not provide a power analysis or formal justification for the sample size."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Standard deviations are reported for all conditions in Table 3 (accuracy, duration, confidence) alongside means."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The controlled study (Study 3) compares a treatment condition against a baseline recreating the Magentic-UI interface (Section 5.1-5.2). The design probes study compares three alternative designs."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The baseline is the current Magentic-UI interface, which represents the state of the art for HIL CUA systems. The system uses GPT-5 (Section 5.2), described as 'state-of-the-art model at the time of the study.'"
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No ablation study isolates which components of the treatment interface (requirements, assumptions, linking, annotations) contribute to the observed effects. The design probes study explores alternatives but is not a controlled ablation."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Three metrics are used in the controlled study: accuracy, duration, and confidence (Table 3, Figure 6)."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The entire study is a human evaluation — participants judge whether agent outputs are correct, with qualitative thematic analysis of their reasoning."
     93       },
     94       "held_out_test_set": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "This is a user study, not a machine learning evaluation. No train/test split is applicable."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down by ground truth answer (correct/incorrect) crossed with user answer (Yes/No), yielding Yes-Yes, No-No, No-Yes, Yes-No subsets (Table 3, Figure 6)."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Extensive discussion of failure cases: overreliance justified by 'reasonable process' (Section 4.3.2), missed errors in formative study (Section 3.3.3), increased false confidence in controlled study (Section 5.3.2)."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Key negative results are prominently reported: the treatment interface did not meaningfully improve accuracy (Hedges' g: 0.18) and increased false confidence when users were wrong (Hedges' g: 0.85). These are central findings."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims match results: 'current practices are cumbersome' (Study 1), 'proposed design reduced the time participants spent finding errors' (g: -0.65 for correct answers), 'accuracy was not meaningfully improved' (g: 0.18), 'higher levels of confidence' (g: 0.56)."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper uses causal language ('reduced the time,' 'increased false confidence') from a within-subjects counterbalanced design with n=12. While counterbalancing helps, the small sample and the absence of randomized condition ordering mean that confounds (learning effects, task difficulty differences) are not fully controlled."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The Limitations section (Section 7) explicitly bounds generalization: participants are tech firm employees not representative of broader users, only one CUA system tested, only post-hoc verification studied, and Wizard-of-Oz limitations acknowledged."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Section 6.1 discusses that browser access may have evened performance between conditions. The paper considers that green checkmarks on confirmed requirements may contribute to overconfidence (Section 6.1). Learning curve effects are noted (Section 5.3.1)."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper clearly distinguishes its proxy measurements (binary correctness judgment, self-reported confidence on 1-7 scale, task duration) from broader claims about 'effective human oversight.' Section 7 notes the confidence metric's limitation: 'Further exploring this strong effect with a more nuanced metric of confidence would better define this issue.'"
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "Study 2 uses 'OpenAI's GPT-4o' and Study 3 uses 'OpenAI's GPT-5' — both marketing names without snapshot dates or API versions (Sections 4.2, 5.2)."
    147       },
    148       "prompts_provided": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "The paper evaluates Magentic-UI as a black-box CUA system; the researchers did not design prompts — they designed UI interfaces for human oversight. No prompting is part of the paper's contribution."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "The paper does not tune or configure model hyperparameters — it evaluates a pre-built CUA system's interface. Hyperparameters are not part of the experimental design."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "Magentic-UI is evaluated as a third-party tool; the authors cannot describe its internal scaffolding. The paper's contribution is the UI layer, not the agent scaffolding."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Study procedures are described in detail: task creation from AssistantBench, task selection criteria, WoZ annotation methodology following the trace, thematic analysis process (line-by-line coding, 5 rounds inductive/deductive for Study 1, 3 rounds for Study 2, 3 rounds for Study 3)."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 7 'Limitations' is a dedicated section discussing multiple specific limitations of the study."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 7 discusses specific threats: non-representative participants (tech firm employees), single CUA system, only post-hoc verification tested, self-defined confidence scale, WoZ vs. automated implementation differences."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 7 explicitly states what was not tested: other CUA systems, non-tech-firm users, real-time verification during execution, fully implemented (non-WoZ) interfaces."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No raw data (participant responses, timing logs, interview transcripts) is made available."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Data collection is described in detail for all three studies: in-person procedures, consent forms, task sequences, think-aloud protocol, questionnaires, semi-structured interviews, timing measurements (Sections 3.2, 4.2, 5.2)."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "Participants are described as 'employees of a large tech firm' with 'some experience using agents' but no details on recruitment channels, how they were selected, or whether recruitment could introduce bias. Study 2 notes 8 of 12 participated in Study 1 but doesn't explain why."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The qualitative analysis pipeline is documented: line-by-line coding, followed by multiple rounds of inductive and deductive thematic analysis (5 rounds for Study 1, 3 for Study 2, 3 for Study 3). Quantitative measures (accuracy, duration, confidence) are defined with clear endpoints."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding statement is present. The acknowledgments thank 'the Microsoft Research AI Frontiers group for their feedback' but do not disclose funding sources."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly listed: 5 of 6 authors are from Microsoft Research, and the first author completed the work during an internship at Microsoft Research."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "Microsoft Research authors evaluate Magentic-UI, a Microsoft Research product. The funder/employer has a direct interest in demonstrating that their HIL approach has value and can be improved."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests statement is present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. It evaluates human oversight of a CUA system."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "No model capability benchmark evaluation is performed."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No model capability benchmark evaluation is performed."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No pre-registration is mentioned for any of the three studies."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": true,
    255         "answer": true,
    256         "justification": "Section 3.2 states: 'Our studies were approved by our internal institutional review board (IRB).'"
    257       },
    258       "demographics_reported": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "Participants are described only as 'employees of a large tech firm' who are 'familiar with agentic workflows.' No demographics (gender, age, experience level, role) are reported."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": true,
    265         "answer": false,
    266         "justification": "No formal inclusion/exclusion criteria are stated. Study 1 participants 'had not previously used Magentic-UI.' Study 2 notes 8 had participated in Study 1. Study 3 had no participant overlap with prior studies. But no systematic criteria are described."
    267       },
    268       "randomization_described": {
    269         "applies": true,
    270         "answer": true,
    271         "justification": "Study 2: 'The order of the methods and tasks was randomized' (Section 4.2). Study 3: 'the order of these conditions and their associated tasks balanced across all participants' (Section 5.2)."
    272       },
    273       "blinding_described": {
    274         "applies": true,
    275         "answer": false,
    276         "justification": "No blinding is described. Participants could see which interface they were using (baseline vs. treatment), and the study conductor was not blinded. No discussion of whether this affects results."
    277       },
    278       "attrition_reported": {
    279         "applies": true,
    280         "answer": true,
    281         "justification": "Study 1 reports incomplete tasks: Table 4 shows timing and technical incomplete counts per task. Study 3 reports 10 in-person and 2 virtual participants with all completing the study. No attrition in Studies 2 or 3."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "This is a user study paper evaluating UI design for human oversight, not proposing a computational method."
    289       },
    290       "compute_budget_stated": {
    291         "applies": false,
    292         "answer": false,
    293         "justification": "This is a user study paper, not a computational method."
    294       }
    295     }
    296   },
    297   "claims": [
    298     {
    299       "claim": "Current verbose trace displays leave Human-Agent teams error-prone, with participants missing small but impactful errors.",
    300       "evidence": "Study 1 (Section 3.3.3): Task 1 had 1/9 incorrect, Task 2 had 5/10 incorrect, Task 3 had 10/12 incorrect. Specific examples: CUA estimated youth prices when unavailable (n=6), only half noticed (n=3).",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "The novel specification-based interface reduced time for error finding compared to baseline.",
    305       "evidence": "Section 5.3.1, Table 3: Overall Hedges' g = -0.29 (small effect). For correct user answers on incorrect outputs: Hedges' g = -0.65 (medium effect). Duration decreased from 270.79s to 229.63s for the No-No condition.",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "The treatment interface did not meaningfully improve accuracy.",
    310       "evidence": "Section 5.3.2, Table 3: Hedges' g = 0.18 for accuracy. Baseline accuracy 72.92% vs treatment 77.08%, with a wide 95% CI on the effect size [-0.52, 0.94].",
    311       "supported": "strong"
    312     },
    313     {
    314       "claim": "The treatment interface increased false confidence when participants were incorrect.",
    315       "evidence": "Section 5.3.2, Table 3: Hedges' g = 0.85 (large effect) for confidence in the No-Yes condition. Confidence rose from 3.64 to 4.82 when participants incorrectly judged wrong answers as correct.",
    316       "supported": "moderate"
    317     },
    318     {
    319       "claim": "A 'reasonable' process justifies overreliance — participants cited process reasonability when accepting incorrect answers.",
    320       "evidence": "Section 4.3.2: Multiple participant quotes (P2, P4, P7, P8, P10, P11) citing reasonable process for incorrect acceptance. Flowchart (most process-oriented) had lowest accuracy at 39.13% on incorrect tasks.",
    321       "supported": "moderate"
    322     }
    323   ],
    324   "methodology_tags": [
    325     "qualitative"
    326   ],
    327   "key_findings": "Through three iterative user studies (n=12 each) on a Computer Use Agent, the paper finds that current verbose trace displays are cumbersome and hide errors from users. A novel specification-based interface with requirements, assumptions, and cross-linked annotations reduced error-finding time (Hedges' g: -0.65) but did not improve accuracy (g: 0.18) and increased false confidence when errors were missed (g: 0.85). Process-oriented displays justified overreliance, as participants cited a 'reasonable' process when accepting incorrect outputs.",
    328   "red_flags": [
    329     {
    330       "flag": "Company evaluating own product",
    331       "detail": "5 of 6 authors are Microsoft Research employees evaluating Magentic-UI, a Microsoft Research product. The first author was an MSR intern. While the findings are nuanced (showing limitations), the framing positions the product as a meaningful platform for oversight research."
    332     },
    333     {
    334       "flag": "Very small sample sizes",
    335       "detail": "All three studies use n=12. The controlled study has wide confidence intervals (e.g., the effect-size CI for accuracy, [-0.52, 0.94]) that include both meaningful improvement and meaningful harm. The paper acknowledges low power but still draws conclusions about effect directions."
    336     },
    337     {
    338       "flag": "Non-representative participants",
    339       "detail": "All participants are employees of a large tech firm with prior AI agent experience. Results may not generalize to the broader population of agent users. Demographics beyond employer are not reported."
    340     },
    341     {
    342       "flag": "Wizard-of-Oz confound",
    343       "detail": "Studies 2 and 3 use hand-crafted annotations rather than automated generation. The paper acknowledges this (Section 7) but the quality of hand-crafted annotations likely exceeds what an automated system would produce, potentially inflating treatment effects."
    344     }
    345   ],
    346   "cited_papers": [
    347     {
    348       "title": "Magentic-ui: Towards human-in-the-loop agentic systems",
    349       "authors": ["Hussein Mozannar", "Gagan Bansal", "Cheng Tan", "Adam Fourney"],
    350       "year": 2025,
    351       "arxiv_id": "2507.22358",
    352       "relevance": "The HIL CUA system evaluated in this paper; central to understanding agentic system design for human oversight."
    353     },
    354     {
    355       "title": "Ai agents that matter",
    356       "authors": ["Sayash Kapoor", "Benedikt Stroebl", "Zachary S Siegel", "Nitya Nadgir", "Arvind Narayanan"],
    357       "year": 2024,
    358       "relevance": "Raises concerns about agent evaluation focusing solely on accuracy rather than cost, relevant to evaluation methodology critique."
    359     },
    360     {
    361       "title": "Challenges in human-agent communication",
    362       "authors": ["Gagan Bansal", "Jennifer Wortman Vaughan", "Saleema Amershi"],
    363       "year": 2024,
    364       "arxiv_id": "2412.10380",
    365       "relevance": "Discusses granularity challenges in agent trace verbosity, directly relevant to this paper's findings on trace design."
    366     },
    367     {
    368       "title": "Collaborative gym: A framework for enabling and evaluating human-agent collaboration",
    369       "authors": ["Yijia Shao", "Vinay Samuel", "Yucheng Jiang"],
    370       "year": 2024,
    371       "arxiv_id": "2412.15701",
    372       "relevance": "Found 65% communication error rate in real user conditions with agents; validates the oversight challenge this paper addresses."
    373     },
    374     {
    375       "title": "CowPilot: A Framework for Autonomous and Human-Agent Collaborative Web Navigation",
    376       "authors": ["Faria Huq", "Zora Zhiruo Wang"],
    377       "year": 2025,
    378       "relevance": "HIL framework for web agent collaboration, comparable system for human-agent interaction research."
    379     },
    380     {
    381       "title": "When combinations of humans and AI are useful: A systematic review and meta-analysis",
    382       "authors": ["Michelle Vaccaro", "Abdullah Almaatouq", "Thomas Malone"],
    383       "year": 2024,
    384       "relevance": "Meta-analysis finding that human-AI teams perform worse on average than the best of humans or AI alone across 106 studies; key context for oversight effectiveness."
    385     },
    386     {
    387       "title": "To trust or to think: cognitive forcing functions can reduce overreliance on AI in AI-assisted decision-making",
    388       "authors": ["Zana Buçinca", "Maja Barbara Malaya", "Krzysztof Z Gajos"],
    389       "year": 2021,
    390       "relevance": "Foundational work on cognitive forcing functions to reduce AI overreliance, directly relevant to the overreliance findings."
    391     },
    392     {
    393       "title": "Why do multi-agent llm systems fail?",
    394       "authors": ["Mert Cemri"],
    395       "year": 2025,
    396       "arxiv_id": "2503.13657",
    397       "relevance": "Analysis of failure modes in multi-agent LLM systems, relevant to understanding agent errors users must oversee."
    398     },
    399     {
    400       "title": "Swe-bench: Can language models resolve real-world github issues?",
    401       "authors": ["Carlos E Jimenez", "John Yang"],
    402       "year": 2023,
    403       "arxiv_id": "2310.06770",
    404       "relevance": "Major coding agent benchmark; context for the agent capability landscape this paper's oversight work addresses."
    405     },
    406     {
    407       "title": "Practices for governing agentic AI systems",
    408       "authors": ["Yonadav Shavit", "Sandhini Agarwal", "Miles Brundage"],
    409       "year": 2023,
    410       "relevance": "OpenAI's governance practices citing 'legibility of agent activity' as critical, directly motivating this paper's research."
    411     }
    412   ]
    413 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs