scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23142B)
      1 {
      2   "paper": {
      3     "title": "Interactive Debugging and Steering of Multi-Agent AI Systems",
      4     "authors": ["Will Epperson", "Gagan Bansal", "Victor Dibia", "Adam Fourney", "Jack Gerrits", "Erkang Zhu", "Saleema Amershi"],
      5     "year": 2025,
      6     "venue": "CHI '25 (CHI Conference on Human Factors in Computing Systems)",
      7     "arxiv_id": "2503.02068",
      8     "doi": "10.1145/3706598.3713581"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["qualitative", "case-study"],
     13   "key_findings": "Formative interviews with 5 agent developers revealed core debugging challenges: difficulty reviewing long multi-turn conversations, lack of interactive debugging support, and slow iteration on agent configuration. A two-part user study of AGDebugger (n=14: 6 in Part 1, 8 in Part 2) showed that Part 2 participants used three steering strategies: adding specific instructions (14/24 edits), simplifying instructions (5/24), and modifying agent plans (5/24). Message resetting was rated most helpful (4.9/5), but only 2 of the 8 Part 2 participants successfully steered agents to the correct answer.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper states 'AGDebugger is available as an open source tool at https://github.com/microsoft/agdebugger' (Section 1)."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No study data (interview transcripts, survey responses, user study logs, edit histories) is released or linked."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, requirements.txt, or dependency details are provided in the paper."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step instructions for reproducing the user study or running AGDebugger on the same tasks are provided."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Figure 7 shows mean scores with 95% confidence intervals for system and feature usability ratings."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper compares AGDebugger to a baseline condition in Part 1 and reports Likert ratings, but no statistical significance tests are performed on any comparisons."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No effect sizes (Cohen's d, etc.) are reported. Only raw Likert means and counts of edit types are given."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification is given for why n=5 (formative), n=6 (Part 1), and n=8 (Part 2) were chosen. No power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Only means and 95% CIs are shown in Figure 7. Neither standard deviations nor variance across participants are reported in tables or text."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Part 1 compares AGDebugger against a 'reduced version of the system that lacked the ability to reset messages or the overview visualization' (Section 6.1)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The baseline represents current developer workflow (reading logs without interactive features), which is appropriate for this type of tool evaluation."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The system has three features (message sending, resetting/editing, overview visualization) but no ablation study isolates their individual contributions. Part 1 only compares full system vs. reduced version."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple measures are collected: system usability ratings (3 questions), feature-level ratings (3 features), edit type classification, number of edits, time to first edit, task success rate, and qualitative feedback."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The entire evaluation is human-centered: participants used the tool and provided ratings, think-aloud observations, and interview responses."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "Not applicable — this is a qualitative user study, not a benchmark evaluation with train/test splits."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by feature (message sending 4.1/5, resetting 4.9/5, overview 4.1/5 in Figure 7) and by edit type (14 add, 5 simplify, 5 modify in Figure 8)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 7.1 discusses open challenges including non-resettable actions, steering requiring deep implementation knowledge, and difficulty determining if edits had effect. Only 2/8 participants succeeded."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that only 2 of 8 participants steered agents to the correct answer (Section 6.3), no participants used the agent configuration feature, and participants were frustrated when agents didn't respond to edits (Section 7.1)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims are well-supported: formative interviews identifying challenges (Section 4), three key features of AGDebugger (Section 5), common steering strategies (Section 6.4), and importance of message resets (Section 6.3, 4.9/5 rating)."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper implies AGDebugger's features cause better debugging ('AGDebugger Facilitates Interactive Steering and Debugging' — Section 6.3 heading), but the Part 1 comparison has only n=6 with no statistical testing and confounded conditions (counterbalanced but not properly powered)."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper tests on only 2 GAIA Level-1 tasks with the Magentic-One team but makes general claims about 'debugging multi-agent AI systems' and 'interfaces for debugging increasingly important agentic workflows' (abstract). Section 6.5 notes this limitation but the title and abstract are not bounded."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No discussion of alternative explanations for findings. E.g., the high rating for message resetting could be due to novelty effect, or the 2/8 success rate could reflect task difficulty rather than tool limitations. Section 6.5 mentions limitations but not alternative explanations for results."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper measures Likert ratings and task success (2/8 correct answers) but frames this as evidence the tool is 'helpful for debugging.' No discussion of whether Likert ratings and short-session success are adequate proxies for real-world debugging effectiveness."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper does not specify which LLM model or version the Magentic-One agents use. No mention of GPT-4 version, temperature, or API details."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The agent prompts (Orchestrator, Web Surfer, etc.) are not provided in the paper or appendix. Only high-level descriptions of agent roles are given."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for the LLM calls made by the agents."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 3.1 describes the AutoGen framework (Python classes, message passing, runtime queue), Section 3.3 describes the 5-agent Magentic-One team with roles, tools, and communication patterns. Section 5.2.1 details checkpointing."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "No description of how interview notes were processed, how qualitative codes were developed, or how survey responses were aggregated beyond 'thematic analysis of interview notes' (Section 4)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6.5 'User Study Limitations' provides a dedicated subsection discussing limitations of the study design."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 6.5 identifies specific threats: 30-minute session may be insufficient, only two GAIA tasks tested, tool not tested during active agent development. These are specific to this study."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 6.5 states 'we tested the system with users on only two benchmark tasks from GAIA' and 'it remains to be seen how the tool assists developers working on other types of tasks.' Also notes concurrent development use was 'not fully captured.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw data (survey responses, interview transcripts, task recordings, edit logs) is made available."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Sections 4 and 6.1 describe data collection: semi-structured interviews (1 hour, detailed notes), survey questionnaires, task recordings, think-aloud observations, and exit interviews. Questions are in Appendices A and B."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 4 states formative interview participants were 'recruited within a large technology corporation' (Microsoft) and had AutoGen experience. Section 6.1 describes Part 1 and Part 2 recruitment from the same company with specified backgrounds."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The analysis pipeline is vaguely described: 'thematic analysis of interview notes' (Section 4), 'aggregated the Likert scale ratings' and 'qualitatively coded interview transcripts' (Section 6.1). No detail on coding process, inter-rater reliability, or transformation steps."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding disclosure or acknowledgment of financial support. The Acknowledgments section thanks individuals but does not mention funding."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: one CMU author and six Microsoft Research authors. The tool is built on AutoGen, a Microsoft open-source project."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Microsoft Research authors are evaluating a tool built on Microsoft's AutoGen framework using Microsoft's Magentic-One agent team. Microsoft has a commercial interest in the success of its agent ecosystem. This conflict is not acknowledged."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This paper evaluates a debugging tool, not a pre-trained model's capability on a benchmark. The agents are tools being debugged, not being evaluated for knowledge."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Not applicable — the paper evaluates a debugging interface, not model knowledge on benchmarks."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable — the GAIA tasks are used as debugging targets, not to evaluate model capability."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No pre-registration is mentioned for either the formative interviews or the user study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No IRB or ethics board approval is mentioned despite conducting interviews and user studies with human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": true,
    256         "answer": true,
    257         "justification": "Section 4 reports formative participants: 3 research scientists, 1 software engineer, 1 engineering manager. Section 6.1 reports Part 1: 4 grad students, 2 research scientists. Part 2: 3 research scientists, 5 grad students, with agent experience levels (none to extensive)."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": true,
    261         "answer": true,
    262         "justification": "Formative interviews required 'experience building multi-agent applications' with AutoGen (Section 4). Part 1 required 'backgrounds in computer science and experience working with LLMs' (Section 6.1). Part 2 required being 'experienced with LLMs.'"
    263       },
    264       "randomization_described": {
    265         "applies": true,
    266         "answer": true,
    267         "justification": "Section 6.1 states 'The order of log reviews and system conditions was randomized and counterbalanced' for Part 1. Part 2 participants were 'assigned just one task' though assignment method is not detailed."
    268       },
    269       "blinding_described": {
    270         "applies": true,
    271         "answer": false,
    272         "justification": "No blinding is described. Participants could clearly see whether they were using AGDebugger or the reduced baseline version, and no evaluator blinding is mentioned."
    273       },
    274       "attrition_reported": {
    275         "applies": true,
    276         "answer": false,
    277         "justification": "No attrition information is reported. It is unclear whether all recruited participants completed the study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No costs reported for the LLM API calls made during debugging sessions, despite agents making many LLM calls (71-90 messages per task)."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No compute budget is stated for running the agent teams or the debugging sessions."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "Agent developers face three core challenges: difficulty reviewing long conversations, lack of interactive debugging support, and need for better tooling to iterate on configurations.",
    296       "evidence": "Formative interviews with 5 developers described in Section 4, with themes synthesized via thematic analysis of interview notes.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Participants frequently made three types of modifications: adding specific instructions (14/24), simplifying instructions (5/24), and modifying agent plans (5/24).",
    301       "evidence": "Section 6.4 reports categorization of all 24 edits across 8 Part 2 sessions, with examples shown in Figure 9.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Message resetting was the most highly rated feature (4.9/5) and every participant used it.",
    306       "evidence": "Section 6.3 and Figure 7 report Likert ratings with 95% CIs. Figure 8 shows all participants edited messages.",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "AGDebugger effectively supported participants in iteratively steering agents towards correct behavior.",
    311       "evidence": "Only 2 of 8 participants steered agents to the correct answer (Section 6.3). Most participants improved understanding but not outcomes.",
    312       "supported": "weak"
    313     },
    314     {
    315       "claim": "Edits to earlier messages in the conversation were more effective for steering agents.",
    316       "evidence": "Section 7.1 notes 'Both of the participants who successfully steered the agents towards producing the correct answer editing messages towards the beginning of the conversation rather than at the end.' Based on n=2 successes.",
    317       "supported": "weak"
    318     }
    319   ],
    320   "red_flags": [
    321     {
    322       "flag": "Company evaluating own product",
    323       "detail": "Six of seven authors are from Microsoft Research, and AGDebugger is built on Microsoft's AutoGen framework using Microsoft's Magentic-One agent team. The user study evaluates the tool's helpfulness without independent evaluation. This conflict is not disclosed or discussed."
    324     },
    325     {
    326       "flag": "Very small sample sizes",
    327       "detail": "Formative interviews n=5, Part 1 n=6, Part 2 n=8. All recruited from within one company (Microsoft). No statistical tests are applied to any comparisons. The 2/8 success rate makes claims about steering effectiveness hard to support."
    328     },
    329     {
    330       "flag": "Convenience sampling from single organization",
    331       "detail": "All participants across both studies were recruited from Microsoft, where AutoGen was developed. This creates potential bias — participants may be more familiar with and favorable toward the underlying framework."
    332     },
    333     {
    334       "flag": "Overclaiming from qualitative evidence",
    335       "detail": "The abstract claims AGDebugger 'effectively supported participants in iteratively steering the agents towards correct behavior,' but only 2/8 succeeded at the steering task. The tool helped understanding more than outcomes."
    336     }
    337   ],
    338   "cited_papers": [
    339     {
    340       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    341       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    342       "year": 2024,
    343       "relevance": "Core multi-agent framework that AGDebugger extends; foundational work on LLM-powered agent collaboration."
    344     },
    345     {
    346       "title": "Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks",
    347       "authors": ["Adam Fourney", "Gagan Bansal", "Hussein Mozannar"],
    348       "year": 2024,
    349       "relevance": "The specific 5-agent team used in the user study; demonstrates state-of-the-art multi-agent performance on GAIA."
    350     },
    351     {
    352       "title": "GAIA: A Benchmark for General AI Assistants",
    353       "authors": ["Grégoire Mialon", "Thomas Scialom", "Clémentine Fourrier"],
    354       "year": 2024,
    355       "relevance": "Benchmark used to source debugging tasks; standardized evaluation of AI agent capabilities."
    356     },
    357     {
    358       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    359       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig"],
    360       "year": 2024,
    361       "relevance": "Major coding agent benchmark requiring multi-step reasoning and tool use."
    362     },
    363     {
    364       "title": "OpenDevin: An Open Platform for AI Software Developers as Generalist Agents",
    365       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    366       "year": 2024,
    367       "arxiv_id": "2407.16741",
    368       "relevance": "Competing platform for multi-agent AI development; compared as related work."
    369     },
    370     {
    371       "title": "AutoGen Studio: A No-Code Developer Tool for Building and Debugging Multi-Agent Systems",
    372       "authors": ["Victor Dibia", "Jingya Chen", "Gagan Bansal"],
    373       "year": 2024,
    374       "arxiv_id": "2408.15247",
    375       "relevance": "Prior agent debugging interface from same team; AGDebugger extends its debugging capabilities."
    376     },
    377     {
    378       "title": "CAMEL: Communicative Agents for 'Mind' Exploration of Large Language Model Society",
    379       "authors": ["Guohao Li", "Hasan Abed Al Kader Hammoud"],
    380       "year": 2023,
    381       "arxiv_id": "2303.17760",
    382       "relevance": "Foundational multi-agent framework using role-playing for LLM collaboration."
    383     },
    384     {
    385       "title": "ChainForge: A Visual Toolkit for Prompt Engineering and LLM Hypothesis Testing",
    386       "authors": ["Ian Arawjo", "Chelse Swoopes", "Priyan Vaithilingam"],
    387       "year": 2023,
    388       "arxiv_id": "2309.09128",
    389       "relevance": "Visual tool for debugging LLM pipelines; closest prior work on LLM debugging interfaces."
    390     },
    391     {
    392       "title": "PromptChainer: Chaining Large Language Model Prompts through Visual Programming",
    393       "authors": ["Tongshuang Wu", "Ellen Jiang", "Aaron Donsbach"],
    394       "year": 2022,
    395       "relevance": "Visual programming for LLM chains; identified cascading error problem that AGDebugger addresses."
    396     },
    397     {
    398       "title": "Software engineering for machine learning: a case study",
    399       "authors": ["Saleema Amershi", "Andrew Begel", "Christian Bird"],
    400       "year": 2019,
    401       "doi": "10.1109/ICSE-SEIP.2019.00042",
    402       "relevance": "Foundational SE4ML study on challenges in developing ML systems; motivates need for debugging tools."
    403     },
    404     {
    405       "title": "WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models",
    406       "authors": ["Hongliang He", "Wenlin Yao", "Kaixin Ma"],
    407       "year": 2024,
    408       "relevance": "Web browsing agent evaluated on complex web tasks; demonstrates the type of agent behavior AGDebugger debugs."
    409     },
    410     {
    411       "title": "Lost in the Middle: How Language Models Use Long Contexts",
    412       "authors": ["Nelson F. Liu", "Kevin Lin", "John Hewitt"],
    413       "year": 2024,
    414       "relevance": "Documents LLM attention limitations over long contexts, directly relevant to why agent steering is difficult."
    415     }
    416   ]
    417 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs