scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32859B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "DoVer: Intervention-Driven Auto Debugging for LLM Multi-Agent Systems",
      6     "authors": [
      7       "Ming-Jie Ma",
      8       "Jue Zhang",
      9       "Fangkai Yang",
     10       "Yu Kang",
     11       "Qingwei Lin",
     12       "Saravan Rajmohan",
     13       "Dongmei Zhang"
     14     ],
     15     "year": 2025,
     16     "venue": "arXiv.org",
     17     "arxiv_id": "2512.06749",
     18     "doi": "10.48550/arXiv.2512.06749"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Abstract claims ('flips 18-28% of failed trials', 'up to 16% milestone progress', 'validates or refutes 30-60%', '49% on GSMPlus') are all directly supported by Tables 2 and 3.",
     26         "source": "opus"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The intervention framework is inherently causal — it edits specific steps and re-executes to measure the effect. The ablation design (varying model, few-shot) uses controlled single-variable manipulation. The paper's core design IS a causal methodology for debugging.",
     32         "source": "opus"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Section 7 (Limitations and Generalizability) explicitly bounds the scope: two agent frameworks, specific task types (web-based info seeking, math), sequential orchestration, and specific model families. States results are 'evidence of feasibility rather than universal guarantees.'",
     38         "source": "opus"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "The paper discusses LLM stochasticity (mitigated by 3 runs), tool limitations as alternative explanations for Inconclusive cases (Section 5.5), and that intervention failures may reflect sub-agent capability gaps rather than wrong hypotheses.",
     44         "source": "opus"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper explicitly distinguishes between Trial Success Rate (binary task completion) and Progress Made (milestone-based partial credit), and acknowledges in Section 4.2 that milestone evaluation relies on LLM-as-a-judge as a proxy for human judgment.",
     50         "source": "opus"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Section 7 is a dedicated 'Limitations and Generalizability' section with substantial discussion spanning a full page.",
     58         "source": "opus"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Section 7 discusses specific threats: reliance on LLM-as-a-judge for milestone evaluation, restriction to orchestrator-level text interventions, requirement for checkpoint/replay infrastructure, and limitation to sequential orchestration topologies.",
     64         "source": "opus"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Section 7 explicitly states what was NOT tested: 'long-running production workloads, domains with strict latency or cost constraints, or settings with safety-critical requirements', asynchronous orchestrators, and sub-agent code modification.",
     70         "source": "opus"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding sources are disclosed. The first author is an intern at Microsoft and other authors are Microsoft employees, but no funding acknowledgment section exists.",
     78         "source": "opus"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Author affiliations clearly list Microsoft and the Chinese Academy of Sciences. The footnote notes 'Work is done during an internship at Microsoft.'",
     84         "source": "opus"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "The work is done at Microsoft, which develops the Magentic-One and AutoGen2 frameworks being evaluated. Microsoft has a commercial interest in demonstrating these frameworks can be improved.",
     90         "source": "opus"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests or financial interests statement is provided in the paper.",
     96         "source": "opus"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Key terms are defined precisely: 'failure' (Section 1: execution without interruption yet incorrect results), 'trial' (Section 4: planning step plus corresponding execution steps), and intervention types (modified sub-agent instructions vs. plan updates).",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 1 lists three explicit contributions: (i) DoVer framework, (ii) analysis of ground-truth annotation challenges in log-based attribution, (iii) experimental demonstration of failure recovery and hypothesis validation.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 2 provides substantive related work covering failure analysis (MAST, TRAIL), attribution methods (WW), and debugging tools (AGDebugger, LangGraph), positioning DoVer's intervention-based approach relative to existing log-only methods.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "The paper states 'Project website and code will be available at https://aka.ms/DoVer' — a promise of future release, not a current release.",
    127           "source": "opus"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "The datasets used (GAIA, AssistantBench, GSMPlus, WW) are publicly available benchmarks. The paper builds on these standard public datasets.",
    133           "source": "opus"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "No environment specifications, requirements files, or dependency details are provided in the paper.",
    139           "source": "opus"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "No step-by-step reproduction instructions are provided. The paper describes the pipeline conceptually but lacks executable reproduction guidance.",
    145           "source": "opus"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "Main results in Tables 2 and 3 report point estimates (e.g., '17.6%', '27.5%') without confidence intervals or error bars. Table 5 includes ± notation for reproduction experiments but the core DoVer results lack uncertainty quantification.",
    153           "source": "opus"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No statistical significance tests are used when comparing DoVer's performance across settings or against baselines (Self-Refine, CRITIC). Differences are asserted by comparing raw percentages.",
    159           "source": "opus"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "The paper reports percentage improvements with baseline context: e.g., '17.6% trial success rate' on WW vs '27.5%' on GAIA-Level-1, '49% flip rate' on GSMPlus, and '+15.7% progress made'. These provide magnitude context.",
    165           "source": "opus"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "No justification for the sample sizes used. The number of failed cases (26 per dataset for M1) is determined by the benchmark and system performance, not by power analysis or design choice.",
    171           "source": "opus"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "The paper states 'we perform three independent runs for each intervention' but does not report variance across these runs. Only aggregate success rates are shown in Tables 2-3.",
    177           "source": "opus"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Section 5.3 compares DoVer against Self-Refine-style and CRITIC-style baselines, both achieving 0% recovery vs DoVer's 17.6%.",
    185           "source": "opus"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Self-Refine (Madaan et al., 2023) and CRITIC (Gou et al., 2023) are recent and relevant self-improvement methods. The paper also discusses contemporaneous works in Section 2.",
    191           "source": "opus"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Section 5.3 includes ablations on different DoVer underlying models (GPT-4o, Qwen3-8B, Qwen3-32B) and few-shot prompting effects.",
    197           "source": "opus"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "The paper uses Trial Success Rate, Progress Made (milestone-based), and hypothesis validation categories (Validated/Partially Validated/Refuted/Inconclusive).",
    203           "source": "opus"
    204         },
    205         "human_evaluation": {
    206           "applies": true,
    207           "answer": false,
    208           "justification": "Milestone progress and 'intervention fulfilled' assessments rely on LLM-as-a-judge (GPT-5). The paper acknowledges this limitation in Section 7 but does not include human evaluation.",
    209           "source": "opus"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "The paper uses standard benchmark validation/test splits (GAIA Level-1 validation set, WW hand-crafted cases, GSMPlus testmini split).",
    215           "source": "opus"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Results are broken down by dataset (WW-AB, WW-GAIA, GAIA-Level-1, GSMPlus) and by hypothesis validation category (Tables 2-3). Table 4 shows per-model breakdowns.",
    221           "source": "opus"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Section 5.4 presents detailed qualitative case studies for Refuted and Inconclusive outcomes, and Appendix D covers Validated and Partially Validated cases.",
    227           "source": "opus"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "The paper reports that WW-AB achieves '+0% progress made' after intervention, and that 29-67% of cases are Inconclusive. Section 5.5 analyzes why interventions fail.",
    233           "source": "opus"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Footnote 3 specifies 'GPT-4o-20241120' and 'GPT-5-chat-20250807'. Open-source models are specified as 'Qwen3-8B' and 'Qwen3-32B in thinking mode'.",
    241           "source": "opus"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "Full prompt text is provided in Appendix B: Figures 5-11 show the trial segmenter, failure proposer, intervention recommender, milestone extractor, milestone evaluator, and post-intervention classifier prompts.",
    247           "source": "opus"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": false,
    252           "justification": "Footnote 3 states 'All LLM API calls are made through Azure OpenAI using default parameters' but does not specify what those defaults are (temperature, top-p, max tokens).",
    253           "source": "opus"
    254         },
    255         "scaffolding_described": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "The DoVer pipeline is described in detail (Section 4): trial segmentation, failure attribution, intervention generation, intervention execution with checkpoint/replay. The Magentic-One and AG2 frameworks are described including their orchestration patterns.",
    259           "source": "opus"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "Section 5.1 describes the failure trace collection process, how datasets were constructed (e.g., excluding WW cases from GAIA-Level-1), and Table 1 documents the filtering from total cases to failed cases to intervened cases.",
    265           "source": "opus"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "The failure traces, intervention logs, and checkpoint data are not released. Only aggregate results in tables are provided.",
    273           "source": "opus"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "Section 5.1 describes failure trace collection: an initial run over all cases, followed by evaluation of outcomes to identify failures, with GPT-4o powering the agents that generate the traces. The resulting success rate matches the reported M1 numbers.",
    279           "source": "opus"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human participants. Data comes from standard benchmarks (GAIA, AssistantBench, GSMPlus).",
    285           "source": "opus"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "Table 1 documents the pipeline: total cases → failed cases → intervened cases → intervened trials, with counts at each stage. The filtering from failed to intervened is explained (LLMs may conclude no mistake occurred).",
    291           "source": "opus"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "The paper does not state the training data cutoff dates for GPT-4o, GPT-5, or Qwen3 models used.",
    299           "source": "opus"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "No discussion of whether GAIA, AssistantBench, or GSMPlus benchmark problems appeared in the training data of the models used.",
    305           "source": "opus"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "GAIA and AssistantBench were published in 2024, and the models (GPT-4o/GPT-5) may have been trained on data including these benchmarks. This is not discussed.",
    311           "source": "opus"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants in this study.",
    319           "source": "opus"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants in this study.",
    325           "source": "opus"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants in this study.",
    331           "source": "opus"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants in this study.",
    337           "source": "opus"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants in this study.",
    343           "source": "opus"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants in this study.",
    349           "source": "opus"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants in this study.",
    355           "source": "opus"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "No API costs, tokens consumed, or wall-clock times are reported despite the pipeline making many LLM calls (trial segmentation + failure attribution + intervention generation + re-execution + evaluation, each repeated 3 times).",
    363           "source": "opus"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "No total computational budget is stated. The paper runs thousands of GPT-4o and GPT-5 API calls across all experiments without quantifying cost or compute.",
    369           "source": "opus"
    370         }
    371       },
    372       "experimental_rigor": {
    373         "seed_sensitivity_reported": {
    374           "applies": true,
    375           "answer": false,
    376           "justification": "The paper performs 3 runs per intervention but does not report sensitivity to random seeds or show per-seed variation. Only aggregate success rates across all runs are reported.",
    377           "source": "opus"
    378         },
    379         "number_of_runs_stated": {
    380           "applies": true,
    381           "answer": true,
    382           "justification": "Section 4.2 explicitly states 'we perform three independent runs for each intervention' to reduce impact of LLM stochasticity.",
    383           "source": "opus"
    384         },
    385         "hyperparameter_search_budget": {
    386           "applies": true,
    387           "answer": false,
    388           "justification": "No hyperparameter search budget is reported. Prompt designs appear manually crafted but the number of iterations or alternatives tried is not stated.",
    389           "source": "opus"
    390         },
    391         "best_config_selection_justified": {
    392           "applies": true,
    393           "answer": false,
    394           "justification": "The prompt refinements in Section 3 (step indices + guidance) are presented as iterative improvements but the selection process for the final configuration is not systematically justified.",
    395           "source": "opus"
    396         },
    397         "multiple_comparison_correction": {
    398           "applies": false,
    399           "answer": false,
    400           "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable.",
    401           "source": "opus"
    402         },
    403         "self_comparison_bias_addressed": {
    404           "applies": true,
    405           "answer": false,
    406           "justification": "Microsoft authors evaluate their own DoVer framework on Microsoft's Magentic-One and AutoGen2 systems. The bias of evaluating their own system is not discussed.",
    407           "source": "opus"
    408         },
    409         "compute_budget_vs_performance": {
    410           "applies": true,
    411           "answer": false,
    412           "justification": "DoVer requires multiple LLM calls per trial (segmentation + attribution + intervention + re-execution + evaluation × 3 runs), which is substantially more compute than the baselines. This compute difference is not quantified or discussed.",
    413           "source": "opus"
    414         },
    415         "benchmark_construct_validity": {
    416           "applies": true,
    417           "answer": true,
    418           "justification": "Section 3 provides substantial analysis of why log-based failure attribution metrics (step accuracy) have poor construct validity due to ground-truth uncertainty, motivating the shift to outcome-based metrics. The paper questions what the benchmarks actually measure.",
    419           "source": "opus"
    420         },
    421         "scaffold_confound_addressed": {
    422           "applies": true,
    423           "answer": true,
    424           "justification": "The paper explicitly tests DoVer across two different agent frameworks (Magentic-One and AG2) and different underlying models (GPT-4o, Qwen3-8B, Qwen3-32B), separating the scaffold variable from the model variable.",
    425           "source": "opus"
    426         }
    427       },
    428       "data_leakage": {
    429         "temporal_leakage_addressed": {
    430           "applies": true,
    431           "answer": false,
    432           "justification": "No discussion of whether GPT-4o/GPT-5 training data includes GAIA, AssistantBench, or GSMPlus benchmark solutions, despite these being published before model training cutoffs.",
    433           "source": "opus"
    434         },
    435         "feature_leakage_addressed": {
    436           "applies": true,
    437           "answer": false,
    438           "justification": "The intervention generation stage receives the ground-truth answer as input (visible in Figure 8 prompt). While this is by design for debugging, the paper does not discuss how this ground-truth leakage affects the evaluation's ecological validity.",
    439           "source": "opus"
    440         },
    441         "non_independence_addressed": {
    442           "applies": true,
    443           "answer": false,
    444           "justification": "No discussion of independence between training data and test benchmarks.",
    445           "source": "opus"
    446         },
    447         "leakage_detection_method": {
    448           "applies": true,
    449           "answer": false,
    450           "justification": "No leakage detection or prevention methods are applied.",
    451           "source": "opus"
    452         }
    453       }
    454     }
    455   },
    456   "claims": [
    457     {
    458       "claim": "DoVer recovers 18-28% of failed trials on GAIA/AssistantBench via Magentic-One",
    459       "evidence": "Table 2: WW-AB 17.6%, WW-GAIA 17.6%, GAIA-Level-1 27.5% trial success rates after intervention across 234 intervened trials",
    460       "supported": "strong"
    461     },
    462     {
    463       "claim": "DoVer recovers 49% of failed trials on GSMPlus via AutoGen2, demonstrating cross-framework generalizability",
    464       "evidence": "Table 2: GSMPlus 49.0% trial success rate over 198 intervened trials using AG2-based MathChat",
    465       "supported": "strong"
    466     },
    467     {
    468       "claim": "Log-based failure attribution has fundamental ground-truth uncertainty: 48% of GAIA cases in WW are ambiguous",
    469       "evidence": "Section 3: authors' independent re-annotation identifies 14 of 29 GAIA cases as uncertain due to multi-trial ambiguity, inter-agent misalignment, and cross-annotator disagreement",
    470       "supported": "moderate"
    471     },
    472     {
    473       "claim": "Simple prompt refinements (step indices + guidance) improve step attribution accuracy from 6% to 24%",
    474       "evidence": "Table 5: Hand Crafted step-level accuracy for GPT-4o: Baseline 6.04% → +Step Index 20.69% → +Guidance 23.56%",
    475       "supported": "strong"
    476     },
    477     {
    478       "claim": "Self-Refine and CRITIC style baselines fail entirely (0% recovery) on multi-agent failure repair",
    479       "evidence": "Section 5.3: 'Across all 26 failed WW-GAIA cases, neither baseline is able to flip any failure into success (0% recovery)'",
    480       "supported": "strong"
    481     },
    482     {
    483       "claim": "Open-source models can approach proprietary model performance for DoVer (Qwen3-32B: 16.9% vs GPT-4o: 17.6%)",
    484       "evidence": "Table 4: Qwen3-32B achieves 16.9% trial success rate vs GPT-4o 17.6% on WW-GAIA setting",
    485       "supported": "strong"
    486     }
    487   ],
    488   "methodology_tags": [
    489     "benchmark-eval",
    490     "case-study",
    491     "ablation"
    492   ],
    493   "key_findings": "DoVer is an intervention-driven debugging framework that repairs 18-49% of failed LLM multi-agent system trials by treating failure attribution as a hypothesis to be verified through targeted execution interventions rather than accepted as ground truth. The paper demonstrates that log-based failure attribution has fundamental ambiguity (48% of examined GAIA cases have uncertain ground truth), and that reformulating debugging as 'do-then-verify' substantially outperforms self-improvement baselines (17.6% vs 0% recovery on WW-GAIA). The framework generalizes across two agent architectures (Magentic-One and AutoGen2), and open-source models (Qwen3-32B) approach proprietary GPT-4o performance. However, 29-67% of trials remain inconclusive due to sub-agent capability gaps that orchestrator-level interventions cannot address, revealing the boundaries of automated debugging without system-level modifications.",
    494   "red_flags": [
    495     {
    496       "flag": "Small sample sizes, no significance tests",
    497       "detail": "Main GAIA/AssistantBench results are based on 26-29 cases per dataset with no power analysis; no statistical significance tests accompany any comparative claim despite these small samples."
    498     },
    499     {
    500       "flag": "Microsoft evaluating Microsoft-built tools",
    501       "detail": "Microsoft-affiliated authors evaluate DoVer built on Magentic-One (Microsoft) using GPT-4o via Azure OpenAI (Microsoft); no conflict-of-interest disclosure is made."
    502     },
    503     {
    504       "flag": "Code not released at publication",
    505       "detail": "Code is promised for future release but not available; results cannot be independently reproduced without the checkpoint/replay infrastructure described in ~1,000 lines of code."
    506     },
    507     {
    508       "flag": "LLM-as-judge for key evaluation metrics",
    509       "detail": "Milestone progress evaluation and intervention fulfillment classification both rely on LLM-as-judge; the paper acknowledges this 'may introduce biases despite careful prompt design' but does not quantify the bias."
    510     },
    511     {
    512       "flag": "High inconclusive rates undermine debugging utility",
    513       "detail": "29-67% of trials are classified 'inconclusive' (the agent failed to execute the intervention), meaning DoVer cannot determine if hypotheses are correct or wrong for the majority of cases in the hardest settings."
    514     },
    515     {
    516       "flag": "Benchmark contamination not addressed",
    517       "detail": "GPT-5 (training through ~2025) is evaluated on GAIA and AssistantBench (published 2024); potential memorization of benchmark tasks by the underlying model is never discussed."
    518     },
    519     {
    520       "flag": "Hyperparameters not reported",
    521       "detail": "'Default parameters' for Azure OpenAI calls is insufficient — temperature and other decoding parameters affecting reproducibility and stochasticity are unspecified."
    522     }
    523   ],
    524   "cited_papers": [
    525     {
    526       "title": "Why Do Multi-Agent LLM Systems Fail? (MAST)",
    527       "relevance": "Provides failure taxonomy for multi-agent systems that DoVer builds on for intervention categorization; also provides MathChat prompts used in AG2 experiments"
    528     },
    529     {
    530       "title": "Which Agent Causes Task Failures and When? On Automated Failure Attribution of LLM Multi-Agent Systems (Who&When)",
    531       "relevance": "Direct predecessor providing the log-based attribution benchmark and WW dataset that DoVer analyzes, critiques, and improves upon"
    532     },
    533     {
    534       "title": "GAIA: A Benchmark for General AI Assistants",
    535       "relevance": "Primary evaluation benchmark for web-based multi-agent task debugging experiments across multiple difficulty levels"
    536     },
    537     {
    538       "title": "Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks",
    539       "relevance": "Primary agent framework in which DoVer is implemented and evaluated; checkpoint/replay infrastructure built on top of it"
    540     },
    541     {
    542       "title": "Interactive Debugging and Steering of Multi-Agent AI Systems (AGDebugger)",
    543       "relevance": "Human-in-the-loop debugging tool that DoVer automates; the M1 integration adapts AGDebugger's checkpointing interface"
    544     },
    545     {
    546       "title": "TRAIL: Trace Reasoning and Agentic Issue Localization",
    547       "relevance": "Concurrent work providing turn-level failure taxonomy and empirical evidence that long-context models struggle at trace debugging"
    548     },
    549     {
    550       "title": "AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?",
    551       "relevance": "One of the two primary evaluation datasets for multi-agent failure debugging experiments"
    552     },
    553     {
    554       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    555       "relevance": "Self-improvement baseline compared against DoVer in ablation study; shown to achieve 0% recovery on multi-agent failure repair"
    556     }
    557   ],
    558   "engagement_factors": {
    559     "practical_relevance": {
    560       "score": 3,
    561       "justification": "Directly addresses a real practitioner pain point (debugging opaque multi-agent failures at scale) with a concrete tool, integration guide, and web UI."
    562     },
    563     "surprise_contrarian": {
    564       "score": 2,
    565       "justification": "Challenges the prevailing log-based attribution paradigm by showing ground-truth labels are unreliable in nearly half of examined cases and that self-improvement baselines completely fail."
    566     },
    567     "fear_safety": {
    568       "score": 0,
    569       "justification": "The paper focuses on debugging reliability of agentic systems; no AI safety risks are raised."
    570     },
    571     "drama_conflict": {
    572       "score": 1,
    573       "justification": "Critiques the Who&When benchmark's ground-truth annotations as fundamentally unreliable, a mild controversy within the multi-agent debugging community."
    574     },
    575     "demo_ability": {
    576       "score": 2,
    577       "justification": "A web-based intervention UI for AG2 MathChat is shown (Figure 4) and the framework is demonstrated on real benchmark failures with concrete qualitative case studies."
    578     },
    579     "brand_recognition": {
    580       "score": 2,
    581       "justification": "Microsoft Research affiliation with use of GPT-4o and GPT-5 (OpenAI/Azure) and Magentic-One (Microsoft) gives the work institutional visibility."
    582     }
    583   },
    584   "hn_data": {
    585     "threads": [
    586       {
    587         "hn_id": "42378335",
    588         "title": "Training LLMs to Reason in a Continuous Latent Space",
    589         "points": 283,
    590         "comments": 114,
    591         "url": "https://news.ycombinator.com/item?id=42378335",
    592         "created_at": "2024-12-10T16:26:17Z"
    593       },
    594       {
    595         "hn_id": "43042753",
    596         "title": "LM2: Large Memory Models",
    597         "points": 110,
    598         "comments": 30,
    599         "url": "https://news.ycombinator.com/item?id=43042753",
    600         "created_at": "2025-02-13T23:21:21Z"
    601       },
    602       {
    603         "hn_id": "29568816",
    604         "title": "Proof of Steak",
    605         "points": 79,
    606         "comments": 28,
    607         "url": "https://news.ycombinator.com/item?id=29568816",
    608         "created_at": "2021-12-15T17:16:25Z"
    609       },
    610       {
    611         "hn_id": "30078848",
    612         "title": "Phishing in organizations: Findings from a large-scale and long-term study",
    613         "points": 30,
    614         "comments": 10,
    615         "url": "https://news.ycombinator.com/item?id=30078848",
    616         "created_at": "2022-01-25T22:11:11Z"
    617       },
    618       {
    619         "hn_id": "42456288",
    620         "title": "Rethinking the Combination of Graph Neural Network and Large Language Model",
    621         "points": 2,
    622         "comments": 0,
    623         "url": "https://news.ycombinator.com/item?id=42456288",
    624         "created_at": "2024-12-18T22:41:39Z"
    625       },
    626       {
    627         "hn_id": "38762672",
    628         "title": "Building Trustworthy NeuroSymbolic AI Systems",
    629         "points": 2,
    630         "comments": 0,
    631         "url": "https://news.ycombinator.com/item?id=38762672",
    632         "created_at": "2023-12-25T14:04:27Z"
    633       },
    634       {
    635         "hn_id": "29485809",
    636         "title": "Deep learning for elliptic and parabolic boundary value problems",
    637         "points": 2,
    638         "comments": 0,
    639         "url": "https://news.ycombinator.com/item?id=29485809",
    640         "created_at": "2021-12-08T15:22:21Z"
    641       },
    642       {
    643         "hn_id": "42470646",
    644         "title": "SpikeFI: A Fault Injection Framework for Spiking Neural Networks",
    645         "points": 1,
    646         "comments": 0,
    647         "url": "https://news.ycombinator.com/item?id=42470646",
    648         "created_at": "2024-12-20T12:47:13Z"
    649       }
    650     ],
    651     "top_points": 283,
    652     "total_points": 509,
    653     "total_comments": 182
    654   }
    655 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs