scan-v5.json (27562B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Interactive Debugging and Steering of Multi-Agent AI Systems", 6 "authors": [ 7 "Will Epperson", 8 "Gagan Bansal", 9 "Victor Dibia", 10 "Adam Fourney", 11 "Jack Gerrits", 12 "Erkang Zhu", 13 "Saleema Amershi" 14 ], 15 "year": 2025, 16 "venue": "International Conference on Human Factors in Computing Systems", 17 "arxiv_id": "2503.02068", 18 "doi": "10.1145/3706598.3713581" 19 }, 20 "checklist": { 21 "claims_and_evidence": { 22 "abstract_claims_supported": { 23 "applies": true, 24 "answer": true, 25 "justification": "All abstract claims—formative interviews with five developers, AGDebugger prototype, 14-participant two-part user study, three edit strategies, and importance of message resets—are directly supported by content in Sections 4–6.", 26 "source": "haiku" 27 }, 28 "causal_claims_justified": { 29 "applies": true, 30 "answer": false, 31 "justification": "Part 2 (the main evaluation) has no comparison condition—all 8 participants used only AGDebugger—so causal claims that AGDebugger 'facilitates' or 'effectively supports' debugging cannot be established; Part 1's baseline comparison covers only the identification phase with n=6.", 32 "source": "haiku" 33 }, 34 "generalization_bounded": { 35 "applies": true, 36 "answer": true, 37 "justification": "Section 6.5 explicitly bounds results to two GAIA Level-1 tasks, 30-minute sessions, and participants from one tech company, and notes these may not generalize to other task types or longer development periods.", 38 "source": "haiku" 39 }, 40 "alternative_explanations_discussed": { 41 "applies": true, 42 "answer": false, 43 "justification": "The paper does not consider alternative explanations for high feature ratings—novelty effects, demand characteristics, or the fact that all participants are from the same organization as the tool developers are not discussed.", 44 "source": "haiku" 45 }, 46 "proxy_outcome_distinction": { 47 "applies": true, 48 "answer": true, 49 "justification": "The paper explicitly distinguishes Likert helpfulness ratings from actual debugging success ('steering agents towards the exact correct answer proved quite difficult; two out of eight participants were able to steer agents to the correct answer'), making the gap between perceived and actual effectiveness clear.", 50 "source": "haiku" 51 } 52 }, 53 "limitations_and_scope": { 54 "limitations_section_present": { 55 "applies": true, 56 "answer": true, 57 "justification": "Section 6.5 is a dedicated 'User Study Limitations' section listing multiple specific limitations beyond a single sentence in the conclusion.", 58 "source": "haiku" 59 }, 60 "threats_to_validity_specific": { 61 "applies": true, 62 "answer": true, 63 "justification": "Specific threats named include: 30-minute sessions being insufficient, testing on only two GAIA benchmark tasks, and the tool not being evaluated during active agent development (only on pre-recorded failing runs).", 64 "source": "haiku" 65 }, 66 "scope_boundaries_stated": { 67 "applies": true, 68 "answer": true, 69 "justification": "The paper explicitly states results are scoped to the AutoGen framework, two GAIA Level-1 tasks, 30-minute debugging sessions, and participants with LLM experience at a single organization.", 70 "source": "haiku" 71 } 72 }, 73 "conflicts_of_interest": { 74 "funding_disclosed": { 75 "applies": true, 76 "answer": false, 77 "justification": "The acknowledgments thank colleagues and reviewers but no funding source is disclosed anywhere in the paper.", 78 "source": "haiku" 79 }, 80 "affiliations_disclosed": { 81 "applies": true, 82 "answer": true, 83 "justification": "Author affiliations (CMU and Microsoft Research) are clearly listed on the title page, and two formative interview participants are identified as 'core contributors to the open source [AutoGen] project.'", 84 "source": "haiku" 85 }, 86 "funder_independent_of_outcome": { 87 "applies": true, 88 "answer": false, 89 "justification": "Most authors are Microsoft Research employees evaluating AGDebugger, a tool built on AutoGen (a Microsoft open-source framework); study participants were also recruited from Microsoft, creating direct organizational interest in positive outcomes.", 90 "source": "haiku" 91 }, 92 "financial_interests_declared": { 93 "applies": true, 94 "answer": false, 95 "justification": "No competing interests statement or financial interests declaration appears in the paper.", 96 "source": "haiku" 97 } 98 }, 99 "scope_and_framing": { 100 "key_terms_defined": { 101 "applies": true, 102 "answer": false, 103 "justification": "'Steering' is used throughout as a core concept but never formally defined; 'debugging' is described operationally through examples rather than given a precise definition distinguishing it from related activities like monitoring or testing.", 104 "source": "haiku" 105 }, 106 "intended_contribution_clear": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 1 lists three explicit contributions: (1) formative interview findings, (2) AGDebugger prototype with three key features, and (3) user study results with identified editing strategies.", 110 "source": "haiku" 111 }, 112 "engagement_with_prior_work": { 113 "applies": true, 114 "answer": true, 115 "justification": "Section 2 substantively engages with prior work on ML debugging tools, LLM prompt engineering, agent frameworks, and interactive systems, positioning AGDebugger as extending interactive debugging from single LLM calls and fixed pipelines to multi-agent teams.", 116 "source": "haiku" 117 } 118 } 119 }, 120 "type_checklist": { 121 "empirical": { 122 "artifacts": { 123 "code_released": { 124 "applies": true, 125 "answer": true, 126 "justification": "The paper states 'AGDebugger is available as an open source tool at https://github.com/microsoft/agdebugger' with no caveats about future release.", 127 "source": "haiku" 128 }, 129 "data_released": { 130 "applies": true, 131 "answer": true, 132 "justification": "The GAIA benchmark used for study tasks is publicly available; the paper uses it unmodified from the standard validation set.", 133 "source": "haiku" 134 }, 135 "environment_specified": { 136 "applies": true, 137 "answer": false, 138 "justification": "No requirements file, Dockerfile, or dependency specifications are provided in the paper; the GitHub repository is mentioned but environment specs are not described.", 139 "source": "haiku" 140 }, 141 "reproduction_instructions": { 142 "applies": true, 143 "answer": false, 144 "justification": "No step-by-step instructions for reproducing the user study or the agent debugging sessions are provided; the tool is released but study reproduction is not documented.", 145 "source": "haiku" 146 } 147 }, 148 "statistical_methodology": { 149 "confidence_intervals_or_error_bars": { 150 "applies": true, 151 "answer": true, 152 "justification": "Figure 7 caption explicitly states 'Mean scores are plotted along with a 95% confidence interval' for system and feature usability ratings.", 153 "source": "haiku" 154 }, 155 "significance_tests": { 156 "applies": true, 157 "answer": false, 158 "justification": "No statistical significance tests are applied to any comparative claim, including the Part 1 AGDebugger vs. baseline preference (5/6) or feature rating comparisons, despite the paper making ordinal comparisons across conditions.", 159 "source": "haiku" 160 }, 161 "effect_sizes_reported": { 162 "applies": true, 163 "answer": false, 164 "justification": "No standardized effect sizes are reported; comparisons are given only as means and proportions without Cohen's d or equivalent measures.", 165 "source": "haiku" 166 }, 167 "sample_size_justified": { 168 "applies": true, 169 "answer": false, 170 "justification": "No power analysis or sample size justification is provided for the n=5 interviews, n=6 Part 1, or n=8 Part 2 participant counts.", 171 "source": "haiku" 172 }, 173 "variance_reported": { 174 "applies": true, 175 "answer": true, 176 "justification": "95% confidence intervals in Figure 7 implicitly report variance for the Likert ratings across participants.", 177 "source": "haiku" 178 } 179 }, 180 "evaluation_design": { 181 "baselines_included": { 182 "applies": true, 183 "answer": true, 184 "justification": "Part 1 includes a 'reduced version of the system that lacked the ability to reset messages or the overview visualization' as a baseline comparison condition.", 185 "source": "haiku" 186 }, 187 "baselines_contemporary": { 188 "applies": true, 189 "answer": true, 190 "justification": "The baseline represents current developer practice (reading log files without interactive features), which is an appropriate and realistic comparison for the research question.", 191 "source": "haiku" 192 }, 193 "ablation_study": { 194 "applies": true, 195 "answer": true, 196 "justification": "Part 1's comparison of full AGDebugger vs. version without edit/reset and overview visualization effectively ablates these two feature groups.", 197 "source": "haiku" 198 }, 199 "multiple_metrics": { 200 "applies": true, 201 "answer": true, 202 "justification": "The study uses task success rate (2/8 correct), edit counts, Likert ratings across multiple questions, time-to-first-edit, and qualitative categorization of edit types.", 203 "source": "haiku" 204 }, 205 "human_evaluation": { 206 "applies": true, 207 "answer": true, 208 "justification": "The entire study consists of human evaluation—participants assess tool usefulness through task performance and post-study Likert surveys.", 209 "source": "haiku" 210 }, 211 "held_out_test_set": { 212 "applies": false, 213 "answer": false, 214 "justification": "This is a user study of a debugging tool, not a prediction task requiring train/test splits.", 215 "source": "haiku" 216 }, 217 "per_category_breakdown": { 218 "applies": true, 219 "answer": true, 220 "justification": "Edit types are broken down into three categories (add, simplify, modify) with counts (14/5/5 of 24 total), and feature ratings are reported separately for each of the three primary features.", 221 "source": "haiku" 222 }, 223 "failure_cases_discussed": { 224 "applies": true, 225 "answer": true, 226 "justification": "Section 7.1 discusses failure cases including non-resettable actions, edits failing to change behavior due to LLM attention over long contexts, and participants unable to steer agents successfully.", 227 "source": "haiku" 228 }, 229 "negative_results_reported": { 230 "applies": true, 231 "answer": true, 232 "justification": "The paper honestly reports that only 2/8 participants achieved the correct answer, that no participants used the agent configuration feature, and that edits late in conversations often had no observable effect.", 233 "source": "haiku" 234 } 235 }, 236 "setup_transparency": { 237 "model_versions_specified": { 238 "applies": true, 239 "answer": false, 240 "justification": "The Magentic-One agent team is referenced but no specific LLM versions, snapshot dates, or model names are given for the models powering the five agents during the studies.", 241 "source": "haiku" 242 }, 243 "prompts_provided": { 244 "applies": true, 245 "answer": false, 246 "justification": "System prompts for the Orchestrator, Web Surfer, Coder, Executor, and File Surfer agents are not provided; only the interview questions and user study survey questions appear in the appendices.", 247 "source": "haiku" 248 }, 249 "hyperparameters_reported": { 250 "applies": true, 251 "answer": false, 252 "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported for any agent in the Magentic-One team used in the studies.", 253 "source": "haiku" 254 }, 255 "scaffolding_described": { 256 "applies": true, 257 "answer": true, 258 "justification": "Section 3 describes the AutoGen framework, the five-agent Magentic-One team structure, agent roles, message-passing architecture, and tool capabilities in sufficient detail to understand the system under study.", 259 "source": "haiku" 260 }, 261 "data_preprocessing_documented": { 262 "applies": true, 263 "answer": false, 264 "justification": "The preprocessing of agent logs for the user study (which specific failing runs were selected, how they were prepared for participants) is not documented.", 265 "source": "haiku" 266 } 267 }, 268 "data_integrity": { 269 "raw_data_available": { 270 "applies": true, 271 "answer": false, 272 "justification": "Interview transcripts, study recordings, edit logs, and Likert rating data are not released publicly.", 273 "source": "haiku" 274 }, 275 "data_collection_described": { 276 "applies": true, 277 "answer": true, 278 "justification": "Section 6 describes the study procedure in detail: session length, task assignments, conditions, counterbalancing, think-aloud protocol, and post-task surveys.", 279 "source": "haiku" 280 }, 281 "recruitment_methods_described": { 282 "applies": true, 283 "answer": true, 284 "justification": "Recruitment is described: participants were recruited from a large technology corporation with specific experience requirements (LLM experience, CS background); participant roles and experience levels are listed.", 285 "source": "haiku" 286 }, 287 "data_pipeline_documented": { 288 "applies": true, 289 "answer": false, 290 "justification": "The analysis pipeline from raw data to findings is not fully documented—thematic coding methodology, inter-rater reliability for edit categorization, and Likert data aggregation procedures are not described in detail.", 291 "source": "haiku" 292 } 293 }, 294 "contamination": { 295 "training_cutoff_stated": { 296 "applies": false, 297 "answer": false, 298 "justification": "This paper evaluates a debugging tool for usability, not model capabilities on benchmarks; training data contamination is not relevant to the research question.", 299 "source": "haiku" 300 }, 301 "train_test_overlap_discussed": { 302 "applies": false, 303 "answer": false, 304 "justification": "Not applicable for the same reason—the study measures human use of a debugging interface, not model benchmark performance.", 305 "source": "haiku" 306 }, 307 "benchmark_contamination_addressed": { 308 "applies": false, 309 "answer": false, 310 "justification": "Not applicable—GAIA tasks are used as realistic debugging scenarios for participants, not to evaluate model capabilities in a contamination-sensitive way.", 311 "source": "haiku" 312 } 313 }, 314 "human_studies": { 315 "pre_registered": { 316 "applies": true, 317 "answer": false, 318 "justification": "No pre-registration of the user study is mentioned anywhere in the paper.", 319 "source": "haiku" 320 }, 321 "irb_or_ethics_approval": { 322 "applies": true, 323 "answer": false, 324 "justification": "No IRB approval, ethics board review, or institutional ethics statement appears anywhere in the paper despite involving human participants.", 325 "source": "haiku" 326 }, 327 "demographics_reported": { 328 "applies": true, 329 "answer": false, 330 "justification": "Participant roles (graduate student, research scientist) and experience levels are described, but demographic characteristics (age, gender, etc.) are not reported.", 331 "source": "haiku" 332 }, 333 "inclusion_exclusion_criteria": { 334 "applies": true, 335 "answer": true, 336 "justification": "Part 1 required 'backgrounds in computer science and experience working with LLMs'; Part 2 required LLM experience with varied agent development experience explicitly stratified across four levels.", 337 "source": "haiku" 338 }, 339 "randomization_described": { 340 "applies": true, 341 "answer": true, 342 "justification": "Part 1 explicitly states 'The order of log reviews and system conditions was randomized and counterbalanced.'", 343 "source": "haiku" 344 }, 345 "blinding_described": { 346 "applies": true, 347 "answer": false, 348 "justification": "No blinding is described or possible given that participants interact with visually distinct tool versions; this limitation is not acknowledged.", 349 "source": "haiku" 350 }, 351 "attrition_reported": { 352 "applies": true, 353 "answer": false, 354 "justification": "No attrition statement is provided; while apparently all participants completed the study, this is not explicitly confirmed.", 355 "source": "haiku" 356 } 357 }, 358 "cost_and_practicality": { 359 "inference_cost_reported": { 360 "applies": true, 361 "answer": false, 362 "justification": "No inference cost, API costs, or latency figures are reported for the multi-agent runs used in the study (71-90 message conversations using LLM calls).", 363 "source": "haiku" 364 }, 365 "compute_budget_stated": { 366 "applies": true, 367 "answer": false, 368 "justification": "No total compute budget or resource expenditure is stated for running the agent systems across formative interviews or user study sessions.", 369 "source": "haiku" 370 } 371 } 372 } 373 }, 374 "claims": [ 375 { 376 "claim": "Developers face three core challenges with multi-agent debugging: reviewing long conversations to localize errors, lack of interactive debugging support, and slow iteration on agent configurations.", 377 "evidence": "Five formative interviews with experienced AutoGen developers at Microsoft yielded consistent themes across all three pain points, described in Sections 4.1–4.3.", 378 "supported": "moderate" 379 }, 380 { 381 "claim": "Message editing and reset is the most valued AGDebugger feature, rated 4.9/5 by Part 2 participants.", 382 "evidence": "Figure 7 shows mean ratings from 8 participants; message resetting scored 4.9/5 vs. 4.1/5 for both sending messages and overview visualization.", 383 "supported": "strong" 384 }, 385 { 386 "claim": "Participants used three strategies for steering agents: adding specific instructions (14/24 edits, 58%), simplifying instructions (5/24), and modifying plans (5/24).", 387 "evidence": "Section 6.4 reports direct annotation of 24 total edits across 8 participants in Part 2, with examples provided in Figure 9.", 388 "supported": "strong" 389 }, 390 { 391 "claim": "Interactive debugging of multi-agent systems is inherently difficult; only 2 of 8 participants successfully steered agents to the correct answer.", 392 "evidence": "Section 6.3 reports this directly: 'two out of eight participants were able to steer agents towards the exact right answer.'", 393 "supported": "strong" 394 }, 395 { 396 "claim": "Edits to earlier messages in the conversation are more effective for steering than edits to later messages.", 397 "evidence": "Both successful participants edited messages early in the conversation; the paper observes that LLMs may not attend to edited messages appearing later in long contexts (citing 'lost in the middle' literature).", 398 "supported": "weak" 399 }, 400 { 401 "claim": "Participants spent approximately 10 of 15 minutes reading logs before starting to edit, indicating error localization dominates debugging time.", 402 "evidence": "Section 6.2 states 'participants taking about 10 minutes on average (out of the total 15 minutes allocated per task) before starting to experiment with edits.'", 403 "supported": "strong" 404 } 405 ], 406 "methodology_tags": [ 407 "qualitative", 408 "case-study" 409 ], 410 "key_findings": "Formative interviews with five expert developers identified three core multi-agent debugging challenges: navigating long agent conversations, lack of interactive control, and slow configuration iteration. AGDebugger, an interactive tool enabling message editing and conversation reset via checkpointing, was evaluated in a two-part user study with 14 participants; message editing/reset was rated 4.9/5 for helpfulness. Three distinct steering strategies emerged: adding specificity, simplifying instructions, and modifying plans. Despite high usability ratings, only 2/8 participants achieved correct agent outputs, and edits late in long conversations often failed due to LLM attention limitations over long contexts.", 411 "red_flags": [ 412 { 413 "flag": "Tiny samples", 414 "detail": "n=5 formative interviews, n=6 Part 1, n=8 Part 2 — all from a single organization (Microsoft). These sizes preclude reliable quantitative inference." 415 }, 416 { 417 "flag": "In-house evaluation", 418 "detail": "Most authors are Microsoft Research employees; participants were recruited from within the same company; two formative interviewees are core AutoGen contributors, creating strong organizational interest in positive findings." 419 }, 420 { 421 "flag": "No IRB disclosure", 422 "detail": "Human participant study with no mention of ethics approval or IRB review, which is atypical for a CHI paper." 423 }, 424 { 425 "flag": "No model versions", 426 "detail": "The LLM models powering the five-agent Magentic-One team used throughout all studies are not identified by version or snapshot date." 427 }, 428 { 429 "flag": "Part 2 lacks control condition", 430 "detail": "The main evaluation (Part 2, n=8) has no comparison condition; all participants used AGDebugger with all features, making it impossible to attribute outcomes to specific design choices." 431 }, 432 { 433 "flag": "No inter-rater reliability for qualitative coding", 434 "detail": "Edit categorization (add/simplify/modify) and thematic coding of interview data are reported without any inter-rater reliability assessment." 435 }, 436 { 437 "flag": "No significance testing", 438 "detail": "Comparative claims (5/6 preference, feature rating differences) are made without statistical tests despite the paper presenting quantitative comparisons." 439 } 440 ], 441 "cited_papers": [ 442 { 443 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 444 "relevance": "Core framework underlying AGDebugger and all agent teams studied; central to understanding the multi-agent paradigm being debugged." 445 }, 446 { 447 "title": "GAIA: A Benchmark for General AI Assistants", 448 "relevance": "The benchmark dataset used for all agent tasks in the user study; establishes context for task difficulty and agent evaluation." 449 }, 450 { 451 "title": "Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks", 452 "relevance": "The five-agent team used in the user study; understanding its design is essential for interpreting study results." 453 }, 454 { 455 "title": "AutoGen Studio: A No-Code Developer Tool for Building and Debugging Multi-Agent Systems", 456 "relevance": "Directly related prior work on agent debugging interfaces; AGDebugger explicitly positions itself relative to AutoGen Studio's limitations." 457 }, 458 { 459 "title": "PromptChainer: Chaining Large Language Model Prompts through Visual Programming", 460 "relevance": "Prior work on debugging pipelines of LLM calls; AGDebugger explicitly extends this to multi-agent settings." 461 }, 462 { 463 "title": "ChainForge: A Visual Toolkit for Prompt Engineering and LLM Hypothesis Testing", 464 "relevance": "Related debugging/testing tool for LLM pipelines; cited as prior work that does not address multi-agent systems." 465 }, 466 { 467 "title": "OpenDevin: An Open Platform for AI Software Developers as Generalist Agents", 468 "relevance": "Comparable agent platform cited as lacking robust multi-turn debugging features." 469 }, 470 { 471 "title": "Lost in the Middle: How Language Models Use Long Contexts", 472 "relevance": "Provides theoretical grounding for the observed failure mode where edited messages late in long conversations have minimal effect." 473 }, 474 { 475 "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?", 476 "relevance": "Representative agent benchmark establishing context for complex coding agent evaluation." 477 } 478 ], 479 "engagement_factors": { 480 "practical_relevance": { 481 "score": 3, 482 "justification": "AGDebugger is immediately usable as open-source software, directly addressing a pain point felt by anyone building multi-agent LLM systems." 483 }, 484 "surprise_contrarian": { 485 "score": 1, 486 "justification": "Findings largely confirm expected challenges; the observation that debugging multi-agent systems is hard is not surprising, though the 2/8 success rate quantifies the difficulty concretely." 487 }, 488 "fear_safety": { 489 "score": 1, 490 "justification": "Briefly raises the safety concern of non-resettable agent actions (e.g., an agent sending an email) but this is a minor discussion point, not a primary finding." 491 }, 492 "drama_conflict": { 493 "score": 0, 494 "justification": "No controversy or conflict angle; the paper is a straightforward tool design and evaluation paper." 495 }, 496 "demo_ability": { 497 "score": 3, 498 "justification": "Open-source code is released at github.com/microsoft/agdebugger; anyone with the AutoGen framework can try the tool immediately." 499 }, 500 "brand_recognition": { 501 "score": 2, 502 "justification": "Microsoft Research and CMU are well-known institutions; the CHI venue is prestigious in HCI; AutoGen and Magentic-One are recognized names in the agent community." 503 } 504 }, 505 "hn_data": { 506 "threads": [ 507 { 508 "hn_id": "45235119", 509 "title": "Instruction-Following Pruning for Large Language Models", 510 "points": 5, 511 "comments": 0, 512 "url": "https://news.ycombinator.com/item?id=45235119", 513 "created_at": "2025-09-13T20:37:50Z" 514 }, 515 { 516 "hn_id": "43240687", 517 "title": "Flash Interpretability: Decoding Specialised Feature Neurons in LLM", 518 "points": 1, 519 "comments": 0, 520 "url": "https://news.ycombinator.com/item?id=43240687", 521 "created_at": "2025-03-03T11:26:53Z" 522 } 523 ], 524 "top_points": 5, 525 "total_points": 6, 526 "total_comments": 0 527 } 528 }