scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27112B)
      1 {
      2   "paper": {
      3     "title": "DoVer: Intervention-Driven Auto Debugging for LLM Multi-Agent Systems",
      4     "authors": ["Ming Ma", "Jue Zhang", "Fangkai Yang", "Yu Kang", "Qingwei Lin", "Saravan Rajmohan", "Dongmei Zhang"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2512.06749",
      8     "doi": "10.48550/arXiv.2512.06749"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "DoVer, an intervention-driven debugging framework for LLM multi-agent systems, recovers 18-28% of failed trials on GAIA/AssistantBench within Magentic-One and 49% on GSMPlus within AG2. The framework validates or refutes 30-60% of failure hypotheses. Log-based single-step failure attribution suffers from fundamental ground-truth annotation uncertainty, with 14 of 29 GAIA cases exhibiting uncertain labels. Open-source models (Qwen3-8B/32B) can power DoVer with narrowing gaps to GPT-4o.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper states 'Project website and code will be available at https://aka.ms/DoVer' — a promise of future release, not a current release."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The datasets used (GAIA, AssistantBench, GSMPlus, WW) are publicly available benchmarks. The paper builds on these standard public datasets."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, requirements files, or dependency details are provided in the paper."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The paper describes the pipeline conceptually but lacks executable reproduction guidance."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Main results in Tables 2 and 3 report point estimates (e.g., '17.6%', '27.5%') without confidence intervals or error bars. Table 5 includes ± notation for reproduction experiments but the core DoVer results lack uncertainty quantification."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are used when comparing DoVer's performance across settings or against baselines (Self-Refine, CRITIC). Differences are asserted by comparing raw percentages."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports percentage improvements with baseline context: e.g., '17.6% trial success rate' on WW vs '27.5%' on GAIA-Level-1, '49% flip rate' on GSMPlus, and '+15.7% progress made'. These provide magnitude context."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for the sample sizes used. The number of failed cases (26 per dataset for M1) is determined by the benchmark and system performance, not by power analysis or design choice."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper states 'we perform three independent runs for each intervention' but does not report variance across these runs. Only aggregate success rates are shown in Tables 2-3."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Section 5.3 compares DoVer against Self-Refine-style and CRITIC-style baselines in the WW-GAIA setting; both baselines achieve 0% recovery vs DoVer's 17.6%."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Self-Refine (Madaan et al., 2023) and CRITIC (Gou et al., 2023) are recent and relevant self-improvement methods. The paper also discusses contemporaneous works in Section 2."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 5.3 includes ablations on different DoVer underlying models (GPT-4o, Qwen3-8B, Qwen3-32B) and few-shot prompting effects."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper uses Trial Success Rate, Progress Made (milestone-based), and hypothesis validation categories (Validated/Partially Validated/Refuted/Inconclusive)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Milestone progress and 'intervention fulfilled' assessments rely on LLM-as-a-judge (GPT-5). The paper acknowledges this limitation in Section 7 but does not include human evaluation."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper uses standard benchmark validation/test splits (GAIA Level-1 validation set, WW hand-crafted cases, GSMPlus testmini split)."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by dataset (WW-AB, WW-GAIA, GAIA-Level-1, GSMPlus) and by hypothesis validation category (Tables 2-3). Table 4 shows per-model breakdowns."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 5.4 presents detailed qualitative case studies for Refuted and Inconclusive outcomes, and Appendix D covers Validated and Partially Validated cases."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that WW-AB achieves '+0% progress made' after intervention, and that 29-67% of cases are Inconclusive. Section 5.5 analyzes why interventions fail."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims ('flips 18-28% of failed trials', 'up to 16% milestone progress', 'validates or refutes 30-60%', '49% on GSMPlus') are all directly supported by Tables 2 and 3."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The intervention framework is inherently causal — it edits specific steps and re-executes to measure the effect. The ablation design (varying model, few-shot) uses controlled single-variable manipulation. The paper's core design IS a causal methodology for debugging."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 7 (Limitations and Generalizability) explicitly bounds the scope: two agent frameworks, specific task types (web-based info seeking, math), sequential orchestration, and specific model families. States results are 'evidence of feasibility rather than universal guarantees.'"
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper discusses LLM stochasticity (mitigated by 3 runs), tool limitations as alternative explanations for Inconclusive cases (Section 5.5), and that intervention failures may reflect sub-agent capability gaps rather than wrong hypotheses."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper explicitly distinguishes between Trial Success Rate (binary task completion) and Progress Made (milestone-based partial credit), and acknowledges in Section 4.2 that milestone evaluation relies on LLM-as-a-judge as a proxy for human judgment."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Footnote 3 specifies 'GPT-4o-20241120' and 'GPT-5-chat-20250807'. Open-source models specified as 'Qwen3-8B' and 'Qwen3-32B in thinking mode'."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full prompt text is provided in Appendix B: Figures 5-11 show the trial segmenter, failure proposer, intervention recommender, milestone extractor, milestone evaluator, and post-intervention classifier prompts."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Footnote 3 states 'All LLM API calls are made through Azure OpenAI using default parameters' but does not specify what those defaults are (temperature, top-p, max tokens)."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The DoVer pipeline is described in detail (Section 4): trial segmentation, failure attribution, intervention generation, intervention execution with checkpoint/replay. The Magentic-One and AG2 frameworks are described including their orchestration patterns."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 5.1 describes the failure trace collection process, how datasets were constructed (e.g., excluding WW cases from GAIA-Level-1), and Table 1 documents the filtering from total cases to failed cases to intervened cases."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7 is a dedicated 'Limitations and Generalizability' section with substantial discussion spanning a full page."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 7 discusses specific threats: reliance on LLM-as-a-judge for milestone evaluation, restriction to orchestrator-level text interventions, requirement for checkpoint/replay infrastructure, and limitation to sequential orchestration topologies."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 7 explicitly states what was NOT tested: 'long-running production workloads, domains with strict latency or cost constraints, or settings with safety-critical requirements', asynchronous orchestrators, and sub-agent code modification."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The failure traces, intervention logs, and checkpoint data are not released. Only aggregate results in tables are provided."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 5.1 describes failure trace collection: an initial run over all cases, followed by evaluation of outcomes to identify failures, with GPT-4o powering the agent runs that generate the traces. The resulting success rate matches reported M1 numbers."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data comes from standard benchmarks (GAIA, AssistantBench, GSMPlus)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Table 1 documents the pipeline: total cases → failed cases → intervened cases → intervened trials, with counts at each stage. The filtering from failed to intervened is explained (LLMs may conclude no mistake occurred)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding sources are disclosed. The first author is an intern at Microsoft and other authors are Microsoft employees, but no funding acknowledgment section exists."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations clearly list Microsoft and Chinese Academy of Sciences. The footnote notes 'Work is done during an internship at Microsoft.'"
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "The work is done at Microsoft, which develops the Magentic-One and AG2 (AutoGen2) frameworks being evaluated. Microsoft has a commercial interest in demonstrating these frameworks can be improved."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is provided in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper does not state the training data cutoff dates for GPT-4o, GPT-5, or Qwen3 models used."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether GAIA, AssistantBench, or GSMPlus benchmark problems appeared in the training data of the models used."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "GAIA and AssistantBench were published in 2024, and the models (GPT-4o/GPT-5) may have been trained on data including these benchmarks. This is not discussed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No API costs, tokens consumed, or wall-clock times are reported despite the pipeline making many LLM calls (trial segmentation + failure attribution + intervention generation + re-execution + evaluation, each repeated 3 times)."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget is stated. The paper runs thousands of GPT-4o and GPT-5 API calls across all experiments without quantifying cost or compute."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "The paper performs 3 runs per intervention but does not report sensitivity to random seeds or show per-seed variation. Only aggregate success rates across all runs are reported."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 4.2 explicitly states 'we perform three independent runs for each intervention' to reduce impact of LLM stochasticity."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. Prompt designs appear manually crafted but the number of iterations or alternatives tried is not stated."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The prompt refinements in Section 3 (step indices + guidance) are presented as iterative improvements but the selection process for the final configuration is not systematically justified."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "Microsoft authors evaluate their own DoVer framework on Microsoft's Magentic-One and AG2 (AutoGen2) systems. The bias of evaluating their own system is not discussed."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "DoVer requires multiple LLM calls per trial (segmentation + attribution + intervention + re-execution + evaluation × 3 runs), which is substantially more compute than the baselines. This compute difference is not quantified or discussed."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Section 3 provides substantial analysis of why log-based failure attribution metrics (step accuracy) have poor construct validity due to ground-truth uncertainty, motivating the shift to outcome-based metrics. The paper questions what the benchmarks actually measure."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "The paper explicitly tests DoVer across two different agent frameworks (Magentic-One and AG2) and different underlying models (GPT-4o, Qwen3-8B, Qwen3-32B), separating the scaffold variable from the model variable."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether GPT-4o/GPT-5 training data includes GAIA, AssistantBench, or GSMPlus benchmark solutions, even though these benchmarks may predate the models' training cutoffs, which the paper does not state."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The intervention generation stage receives the ground-truth answer as input (visible in Figure 8 prompt). While this is by design for debugging, the paper does not discuss how this ground-truth leakage affects the evaluation's ecological validity."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of independence between training data and test benchmarks."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods are applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "DoVer flips 18-28% of failed trials into successes on GAIA/AssistantBench within Magentic-One.",
    365       "evidence": "Table 2 shows Trial Success Rate of 17.6% for WW-AB, 17.6% for WW-GAIA, and 27.5% for GAIA-Level-1.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "DoVer recovers 49% of failed trials on GSMPlus within AG2.",
    370       "evidence": "Table 2 shows 49.0% Trial Success Rate on 198 intervened trials in GSMPlus setting.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "DoVer validates or refutes 30-60% of failure hypotheses.",
    375       "evidence": "Table 3 shows Validated+Refuted rates: 29.2% for WW-AB, 37.4% for WW-GAIA, 71.4% for GAIA-Level-1. The claimed 30-60% range is approximate rather than exact: WW-AB falls just below 30% and GAIA-Level-1 exceeds 60%.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Ground-truth uncertainty substantially affects model attribution performance, with 14 of 29 GAIA cases exhibiting uncertain labels.",
    380       "evidence": "Section 3 and Table 7 document the re-annotation of 29 GAIA cases, finding 14 uncertain. Step accuracy is 24% (GPT-4o) / 7% (GPT-5) on uncertain cases vs 44% / 53% on certain cases.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Self-Refine and CRITIC baselines fail to recover any failures (0% recovery) in the WW-GAIA setting.",
    385       "evidence": "Section 5.3 states 'neither baseline is able to flip any failure into success (0% recovery)' across all 26 failed WW-GAIA cases.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Open-source models (Qwen3-8B/32B) can power DoVer with narrowing gaps to GPT-4o.",
    390       "evidence": "Table 4 shows Qwen3-8B at 11.3% (0-shot) / 14.3% (3-shot) and Qwen3-32B at 16.9% vs GPT-4o at 17.6% trial success rate.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "Company evaluating its own frameworks",
    397       "detail": "Microsoft authors evaluate DoVer on Microsoft's Magentic-One and AG2 (AutoGen2) frameworks. While testing on two frameworks adds validity, there is an inherent conflict of interest in showing these frameworks can be improved by another Microsoft-developed tool."
    398     },
    399     {
    400       "flag": "Ground-truth answer provided to intervention generator",
    401       "detail": "The intervention recommender prompt (Figure 8) receives the ground-truth answer as input. While intended for debugging, this means the system has access to the correct answer when generating interventions, which may inflate success rates compared to a realistic deployment scenario."
    402     },
    403     {
    404       "flag": "No cost reporting despite expensive pipeline",
    405       "detail": "DoVer requires multiple LLM calls per trial (segmentation + attribution + intervention + 3× re-execution + evaluation), which is substantially more compute than the baselines. Without cost reporting, the practical viability cannot be assessed."
    406     },
    407     {
    408       "flag": "Small sample sizes for some settings",
    409       "detail": "WW-AB (23 intervened cases), WW-GAIA (25), and GAIA-Level-1 (25) are small samples. Point estimates like '17.6% trial success rate' on these sizes have wide uncertainty that is not quantified."
    410     },
    411     {
    412       "flag": "LLM-as-a-judge for key metrics",
    413       "detail": "Both the Progress Made metric (milestone evaluation) and the 'intervention fulfilled' metric rely on GPT-5 as a judge. The paper acknowledges this limitation but does not validate the LLM judge against human ratings."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Why do multi-agent LLM systems fail?",
    419       "authors": ["Mert Cemri", "Melissa Z. Pan", "Shuyi Yang"],
    420       "year": 2025,
    421       "arxiv_id": "2503.13657",
    422       "relevance": "Catalogs failure patterns in multi-agent LLM systems across task interpretation, planning, tool interaction, and verification."
    423     },
    424     {
    425       "title": "Which agent causes task failures and when? On automated failure attribution of LLM multi-agent systems",
    426       "authors": ["Shaokun Zhang", "Ming Yin", "Jieyu Zhang"],
    427       "year": 2025,
    428       "relevance": "Introduces the Who&When (WW) dataset and single-agent/step failure attribution methodology for multi-agent systems."
    429     },
    430     {
    431       "title": "TRAIL: Trace reasoning and agentic issue localization",
    432       "authors": ["Darshan Deshpande", "Varun Gangal"],
    433       "year": 2025,
    434       "arxiv_id": "2505.08638",
    435       "relevance": "Creates turn-level traces with fine-grained taxonomy for debugging agent systems, showing strong models struggle at trace debugging."
    436     },
    437     {
    438       "title": "Magentic-one: A generalist multi-agent system for solving complex tasks",
    439       "authors": ["Adam Fourney", "Gagan Bansal"],
    440       "year": 2024,
    441       "arxiv_id": "2411.04468",
    442       "relevance": "Primary multi-agent framework used for DoVer experiments; centrally-orchestrated multi-agent architecture."
    443     },
    444     {
    445       "title": "Interactive debugging and steering of multi-agent AI systems",
    446       "authors": ["Will Epperson", "Gagan Bansal"],
    447       "year": 2025,
    448       "doi": "10.1145/3706598.3713581",
    449       "relevance": "AGDebugger: human-in-the-loop debugging tool for multi-agent systems with rewind/edit/re-execute capabilities."
    450     },
    451     {
    452       "title": "Agent-as-a-judge: Evaluate agents with agents",
    453       "authors": ["Mingchen Zhuge", "Changsheng Zhao"],
    454       "year": 2024,
    455       "arxiv_id": "2410.10934",
    456       "relevance": "Evaluation framework for agent systems using agents as evaluators, relevant to LLM-as-a-judge methodology."
    457     },
    458     {
    459       "title": "GAIA: a benchmark for general AI assistants",
    460       "authors": ["Grégoire Mialon", "Clémentine Fourrier"],
    461       "year": 2024,
    462       "relevance": "Primary benchmark used for DoVer evaluation; tests general AI assistant capabilities across multiple difficulty levels."
    463     },
    464     {
    465       "title": "AssistantBench: Can web agents solve realistic and time-consuming tasks?",
    466       "authors": ["Ori Yoran", "Samuel Joseph Amouyal"],
    467       "year": 2024,
    468       "doi": "10.18653/V1/2024.EMNLP-MAIN.505",
    469       "relevance": "Benchmark for realistic web agent tasks, used as second evaluation dataset for DoVer."
    470     },
    471     {
    472       "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversation framework",
    473       "authors": ["Qingyun Wu", "Gagan Bansal"],
    474       "year": 2023,
    475       "arxiv_id": "2308.08155",
    476       "relevance": "Multi-agent conversation framework used as second agent framework for DoVer generality evaluation."
    477     },
    478     {
    479       "title": "Self-refine: Iterative refinement with self-feedback",
    480       "authors": ["Aman Madaan", "Niket Tandon"],
    481       "year": 2023,
    482       "relevance": "Self-improvement baseline method compared against DoVer; achieves 0% recovery on multi-agent failures."
    483     },
    484     {
    485       "title": "Aegis: Automated error generation and attribution for multi-agent systems",
    486       "authors": ["Fanqi Kong", "Ruijie Zhang"],
    487       "year": 2025,
    488       "arxiv_id": "2509.14295",
    489       "relevance": "Specialized failure-tracer model for multi-agent systems; orthogonal approach that could strengthen DoVer's attribution stage."
    490     },
    491     {
    492       "title": "Can agents fix agent issues?",
    493       "authors": ["Alfin Wijaya Rahardja", "Junwei Liu"],
    494       "year": 2025,
    495       "arxiv_id": "2505.20749",
    496       "relevance": "AgentIssue-Bench: benchmark for agent-based repair of real agent system issues, finding low resolution rates."
    497     },
    498     {
    499       "title": "Evaluating agent-based program repair at Google",
    500       "authors": ["Pat Rondon", "Renyao Wei"],
    501       "year": 2025,
    502       "doi": "10.1109/ICSE-SEIP66354.2025.00038",
    503       "relevance": "Industrial evaluation of agent-based program repair on production bugs at Google scale."
    504     }
    505   ]
    506 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs