scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28712B)
      1 {
      2   "paper": {
      3     "title": "MAESTRO: Multi-Agent Evaluation Suite for Testing, Reliability, and Observability",
      4     "authors": [
      5       "Tie Ma",
      6       "Yixi Chen",
      7       "Vaastav Anand",
      8       "Alessandro Cornacchia",
      9       "Amândio R. Faustino",
     10       "Guanheng Liu",
     11       "Shan Zhang",
     12       "Hongbin Luo",
     13       "Suhaib A. Fahmy",
     14       "Zafar A. Qazi",
     15       "Marco Canini"
     16     ],
     17     "year": 2026,
     18     "venue": "arXiv pre-print",
     19     "arxiv_id": "2601.00481"
     20   },
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper provides a GitHub repository URL: https://github.com/sands-lab/maestro (stated in Section 1, end of introduction)."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The paper uses publicly available datasets (HotpotQA) and official example repositories from frameworks (ADK, AutoGen, LangGraph). The MAS instances are drawn from public sources. In its appendix, the paper states plans to release the figures as a dataset."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided in the paper. The paper mentions specific model names and tools (Tavily, Google Search, psutil, OpenTelemetry) but does not provide a reproducible environment specification with library versions."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "While the architecture and integration modes are described conceptually (Section 3.1), no step-by-step reproduction instructions (e.g., commands to run, scripts to execute) are provided in the paper itself. The GitHub repository may contain these, but the paper does not."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Boxplots with interquartile ranges are used throughout (Figures 4, 5, 6, 7, 9, 10, 12). Median and IQR values are reported for key metrics (e.g., 'interquartile range 30.6–356.6 s' in Section 4.4)."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper makes comparative claims (e.g., 'CRAG consistently occupies the lower-cost and lower-latency region', 'Architecture dominates resource patterns') but uses no statistical significance tests (no p-values, t-tests, or bootstrap tests). Claims are based on comparing medians and visual inspection of distributions."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The paper reports effect sizes with baseline context throughout. For example: 'CRAG achieves a median cost of $0.0010 per task, which is more than an order of magnitude lower than both Plan-and-Execute (median $0.0126) and LATS (median $0.0101)' (Section 4.4). Also: 'median accuracy improvement of 35.7%' for CRAG with tools (Section 4.7)."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The paper states 'at least 20 independent runs' per MAS instance (Section 4.1) but provides no justification for why 20 runs is sufficient, no power analysis, and no acknowledgment of whether this sample size supports the comparative claims made."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Variance across runs is reported through boxplots showing full distributions (Figures 4, 5, 9, 10, 12), interquartile ranges (e.g., IQR 30.6–356.6 s for Plan-and-Execute in Section 4.4), and explicit discussion of run-to-run variance (e.g., Section 4.3 on call graph stability)."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The paper explicitly compares multiple MAS architectures against each other (CRAG, Plan-and-Execute, LATS) in the Architecture Suite (Section 4.4-4.7), and compares across 12 MAS instances in the Full Suite. Section 2.2 discusses limitations of existing benchmarks as motivation."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The MAS instances use contemporary frameworks (ADK, AutoGen, LangGraph, MCP-Agent) and contemporary models (Gemini-2.5-Flash, GPT-5-mini, GPT-5-nano, GPT-4o-mini). The benchmark references are from 2023-2025."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The paper systematically varies individual factors while controlling others: architecture (Section 4.4), model choice (Section 4.5), and tool usage on/off (Section 4.7). This constitutes ablation-style analysis isolating contributions of each factor."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Multiple metrics are used: accuracy, cost (monetary), latency/task duration, CPU usage, memory usage, communication volume, Jaccard similarity, LCS similarity, and failure categorization (Table 3)."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "Evaluation of MAS outputs uses automated LLM-as-judge (gpt-4o-mini) for correctness. Section 4.6 explicitly discusses limitations of LLM-as-judge failure attribution, noting divergence across judge models, but no human evaluation of MAS outputs is included."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "The paper does not describe a held-out test set. Inputs are drawn from official example repositories, public datasets (HotpotQA), and synthetic inputs (Section 4.1), but there is no separation of development and test splits. The same configurations appear to be used for both tuning and reporting."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down by architecture (CRAG vs. Plan-and-Execute vs. LATS), by model (six different LLMs), and by tool configuration (with/without tools). Figures 9-12 provide per-architecture and per-model breakdowns. Table 3 provides per-category failure breakdown."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 4.6 is devoted to failure mode analysis. Table 3 categorizes failures (missing/underspecified output: 47.61%, wrong fact/entity: 27.66%, empty prediction: 15.96%, exception: 6.38%, timeout: 1.86%). Model-specific failure patterns are discussed in detail."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Multiple negative results are reported: 'Accuracy degrades with increasing architectural complexity' (Section 4.4), 'Upgrading the base LLM does not reliably reduce cost or improve accuracy' (Finding 6), Plan-and-Execute with tools loses median accuracy (Section 4.7), LATS shows marginal and unstable gains with tools."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract's claim that 'MAS architecture is the dominant driver of resource profiles, reproducibility, and cost–latency–accuracy trade-off' is supported by Sections 4.2-4.7, and its claim that 'MAS executions can be structurally stable yet temporally variable' is supported by Section 4.3 (Jaccard avg 0.86 vs LCS avg 0.65)."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper makes causal claims through controlled experiments that vary one factor at a time (architecture in Section 4.4, model in Section 4.5, tools in Section 4.7). The ablation design is adequate — single-variable manipulation with controlled baselines. Claims like 'tool integration mitigates speculative generation' are supported by trace-level evidence (Section 4.7)."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Section 5.1 explicitly states: 'the insights derived from our evaluation are necessarily grounded in the specific instances and configurations studied. As a result, some findings may not directly transfer to future MAS designs or to application domains not represented in our benchmark.' Findings consistently use qualifying language like 'in our setup', 'in our evaluation'."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The paper discusses alternative explanations for key findings. For example, Section 4.5 explores why stronger models don't improve accuracy (execution dynamics and variance amplification rather than model capacity). Section 4.7 provides trace-level analysis explaining when tools reduce cost (reduced speculative generation). Section 4.6 discusses LLM-as-judge disagreement as a confound."
    138       }
    139     },
    140     "setup_transparency": {
    141       "model_versions_specified": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "The paper uses marketing names: 'Gemini-2.0-Flash-Lite', 'Gemini-2.5-Flash-Lite', 'Gemini-2.5-Flash', 'GPT-4o-mini', 'GPT-5-mini', 'GPT-5-nano', 'gpt-oss-120b'. No snapshot dates, API versions, or exact version identifiers are provided. Per the schema, marketing names without snapshot dates do NOT count."
    145       },
    146       "prompts_provided": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "The paper describes using 'naive artifacts', 'public datasets', and 'synthetic inputs' (Section 4.1) but does not provide the actual prompts used for the MAS instances, the LLM-as-user simulation prompts, or the LLM-as-judge prompts. For human-in-the-loop simulation: 'a designated LLM generates replies conditioned on the MAS outputs' but the conditioning prompt is not shown."
    150       },
    151       "hyperparameters_reported": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "No temperature, top-p, or sampling settings are reported for any of the LLMs used (backbone models, LLM-as-user, or LLM-as-judge). The paper mentions a 10-minute timeout cap and 8,192 token limit for the architecture suite, but these are execution constraints, not LLM hyperparameters."
    155       },
    156       "scaffolding_described": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "The paper describes the scaffolding architecture of each MAS instance: agent counts, interaction patterns (planning, coordination, debate, correction), tool configurations, and workflow structures. Figure 3 shows detailed architecture diagrams for the three Architecture Suite instances (CRAG, Plan-and-Execute, LATS). Table 2 summarizes all 12 instances."
    160       },
    161       "data_preprocessing_documented": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "Section 4.1 describes three input types (naive artifacts, public datasets, synthetic inputs) but does not document how inputs were selected from these sources, what filtering was applied, or how synthetic inputs were generated (beyond 'LLM-generated prompts'). The number of inputs per MAS instance is not clearly stated."
    165       }
    166     },
    167     "limitations_and_scope": {
    168       "limitations_section_present": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 5.1 is titled 'Limitation' and discusses generalizability and telemetry overhead as specific limitations."
    172       },
    173       "threats_to_validity_specific": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 5.1 discusses specific threats: 'the insights derived from our evaluation are necessarily grounded in the specific instances and configurations studied', 'advances in agent orchestration or model capabilities may invalidate certain observations over time', and telemetry overhead 'may degrade system performance'. These are specific to this study."
    177       },
    178       "scope_boundaries_stated": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The paper explicitly bounds scope: 'Currently, MAESTRO only supports the adjustment of a few parameters, such as model choice and tool usage' (footnote 1), 'MAESTRO supports only pre-defined MAS instances' (footnote 2). Section 5.1 states findings 'may not directly transfer to future MAS designs or to application domains not represented in our benchmark'. Section 5.2 lists several out-of-scope areas (distributed MAS, framework overhead, parallelism effects)."
    182       }
    183     },
    184     "data_integrity": {
    185       "raw_data_available": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "For its figures, the paper states plans to 'release them as a dataset' (Appendix A.1), but raw execution traces and telemetry data are not currently available. No data download link is provided beyond the code repository."
    189       },
    190       "data_collection_described": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Section 4.1 describes data collection: inputs from official example repositories, public datasets (HotpotQA), and synthetic LLM-generated inputs. At least 20 independent runs per MAS instance with specified model configurations (six models listed). The observation component (Section 3.1.1) describes what data is collected during execution."
    194       },
    195       "recruitment_methods_described": {
    196         "applies": false,
    197         "answer": false,
    198         "justification": "No human participants. The study evaluates MAS systems using automated execution and LLM-as-judge evaluation."
    199       },
    200       "data_pipeline_documented": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The pipeline is documented in Section 3.1.1: MAS instances preparation -> Configuration -> Runtime execution -> Observation (telemetry collection via OpenTelemetry and psutil) -> Post-processing (aggregation and analysis). Appendix A.1 details the post-processing outputs."
    204       }
    205     },
    206     "conflicts_of_interest": {
    207       "funding_disclosed": {
    208         "applies": true,
    209         "answer": false,
    210         "justification": "No funding or acknowledgments section is present in the paper. Author affiliations are listed (KAUST, Beihang University, MPI-SWS, LUMS) but no funding sources are disclosed."
    211       },
    212       "affiliations_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "All author affiliations are clearly listed: Beihang University, KAUST, MPI-SWS, LUMS. The footnote notes 'Work done while Tie Ma was interning at KAUST'."
    216       },
    217       "funder_independent_of_outcome": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No funding is disclosed. Without knowing the funder, independence cannot be assessed. The paper evaluates products from Google (Gemini, ADK), OpenAI (GPT models), and Microsoft (AutoGen) — if any of these funded the work, there would be a conflict."
    221       },
    222       "financial_interests_declared": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No competing interests or financial interests statement is present in the paper."
    226       }
    227     },
    228     "contamination": {
    229       "training_cutoff_stated": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "The paper evaluates LLM performance on tasks (including HotpotQA for question answering accuracy), but no training data cutoff dates are stated for any of the six models used."
    233       },
    234       "train_test_overlap_discussed": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "HotpotQA (published 2018) is used as a benchmark for QA accuracy evaluation. The models tested were trained after 2018 and may have seen HotpotQA examples in training. This potential overlap is not discussed."
    238       },
    239       "benchmark_contamination_addressed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "HotpotQA has been publicly available since 2018, well before the training cutoff of all models used. The accuracy measurements (e.g., CRAG 70.6%, Plan-and-Execute 48.3%) could be inflated by contamination. This risk is not addressed."
    243       }
    244     },
    245     "human_studies": {
    246       "pre_registered": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants. The study evaluates automated MAS systems."
    250       },
    251       "irb_or_ethics_approval": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants."
    255       },
    256       "demographics_reported": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants."
    260       },
    261       "inclusion_exclusion_criteria": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants."
    265       },
    266       "randomization_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants."
    270       },
    271       "blinding_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       },
    276       "attrition_reported": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants."
    280       }
    281     },
    282     "cost_and_practicality": {
    283       "inference_cost_reported": {
    284         "applies": true,
    285         "answer": true,
    286         "justification": "Monetary cost per task is reported extensively. For example: CRAG median $0.0010, Plan-and-Execute median $0.0126, LATS median $0.0101 (Section 4.4). Cost is a central metric throughout the evaluation, with per-model and per-architecture breakdowns in Figures 9 and 10."
    287       },
    288       "compute_budget_stated": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "While per-task CPU/memory usage is reported (Section 4.2), the total computational budget for the full evaluation campaign is not stated. The paper does not report total API spend, total GPU hours, or total number of runs conducted across all experiments."
    292       }
    293     }
    294   },
    295   "claims": [
    296     {
    297       "claim": "MAS executions are structurally stable (high Jaccard similarity avg 0.86) but temporally variable (moderate LCS similarity avg 0.65).",
    298       "evidence": "Section 4.3, Figure 7: Jaccard similarity averages 0.86 across all examples, while LCS similarity averages 0.65. CRAG and Tree-of-Thoughts show high Jaccard but low LCS scores.",
    299       "supported": "strong"
    300     },
    301     {
    302       "claim": "MAS architecture is the dominant driver of resource consumption profiles, outweighing model choice.",
    303       "evidence": "Section 4.2, Figures 4-6: CRAG exhibits 9.7% avg CPU and 405.3MB avg memory, while Plan-and-Execute uses 0.07% CPU. Model choice influences CPU but has negligible impact on memory.",
    304       "supported": "strong"
    305     },
    306     {
    307       "claim": "Upgrading the base LLM does not reliably reduce cost or improve accuracy in MAS.",
    308       "evidence": "Section 4.5, Figure 10: gpt-5-mini and gpt-5-nano have comparable mean cost (0.033 vs 0.043) despite size differences. GPT-4o-mini achieves lower median cost ($0.0034) than both 5-series models. Accuracy shows no monotonic relationship with model capability.",
    309       "supported": "strong"
    310     },
    311     {
    312       "claim": "Specialized architectures (CRAG) simultaneously reduce resource consumption and maintain strong task performance compared to general-purpose architectures.",
    313       "evidence": "Section 4.4, Figure 9: CRAG achieves 70.6% average accuracy vs 48.3% for Plan-and-Execute, while CRAG's median cost ($0.0010) is over 10x lower than Plan-and-Execute ($0.0126).",
    314       "supported": "strong"
    315     },
    316     {
    317       "claim": "MAS failures predominantly manifest as silent semantic errors (75.17% of all failures).",
    318       "evidence": "Section 4.6, Table 3: Silent semantic failures (missing/underspecified output 47.61% + wrong fact/entity 27.66%) constitute 75.17% of failures. Only 24.84% are explicit failures (empty predictions + exceptions + timeouts).",
    319       "supported": "strong"
    320     },
    321     {
    322       "claim": "External tools improve accuracy only when the agent architecture can integrate them without amplifying execution overhead.",
    323       "evidence": "Section 4.7, Figure 12: CRAG shows 35.7% median accuracy improvement with tools. Plan-and-Execute loses median accuracy with tools. LATS shows marginal gains (4.2% median improvement) with tools, positive in only half of runs.",
    324       "supported": "strong"
    325     },
    326     {
    327       "claim": "LLM-as-judge failure attribution exhibits substantial divergence across judge models.",
    328       "evidence": "Section 4.6, Figure 11b: Different judge models (Gemini-2.5-Flash, GPT-4o, GPT-OSS-120B) disagree on failure categorization for identical inputs. Offline judges cannot identify system-level failures (exceptions, timeouts) without runtime signals.",
    329       "supported": "moderate"
    330     }
    331   ],
    332   "methodology_tags": [
    333     "benchmark-eval"
    334   ],
    335   "key_findings": "MAESTRO is an open-source evaluation suite for LLM-based multi-agent systems that standardizes configuration, execution, and telemetry collection across heterogeneous MAS architectures. Evaluation of 12 MAS instances across 6 backbone models reveals that MAS architecture is the dominant factor determining resource profiles, reproducibility, and the cost-latency-accuracy trade-off, outweighing model and tool configuration choices. The study finds that upgrading backbone models does not reliably improve MAS accuracy or reduce cost, and that 75% of MAS failures are silent semantic errors invisible without runtime telemetry. Task-specialized architectures (e.g., CRAG) achieve both lower cost and higher accuracy than general-purpose architectures.",
    336   "red_flags": [
    337     {
    338       "flag": "No statistical significance tests for comparative claims",
    339       "detail": "The paper makes numerous comparative claims about architectures, models, and tool configurations being different or dominant, but uses no statistical tests. All comparisons are based on visual inspection of boxplots and comparison of medians. With only ~20 runs per configuration, stochastic variance could explain some observed differences."
    340     },
    341     {
    342       "flag": "Benchmark contamination unaddressed",
    343       "detail": "HotpotQA (published 2018) is used for accuracy evaluation of models trained well after 2018. Accuracy numbers (e.g., CRAG 70.6%) may be inflated by data contamination. This is particularly relevant since the paper's architecture-focused findings rely heavily on accuracy comparisons."
    344     },
    345     {
    346       "flag": "LLM-as-judge for correctness evaluation without human validation",
    347       "detail": "GPT-4o-mini is used as the sole evaluator for correctness, but Section 4.6 itself demonstrates that different LLM judges disagree substantially on failure attribution. No human validation of the judge's accuracy assessments is provided, introducing potential systematic bias into all accuracy-dependent findings."
    348     },
    349     {
    350       "flag": "Missing hyperparameter and prompt details",
    351       "detail": "No temperature, top-p, or sampling parameters are reported for any of the six backbone models, the LLM-as-user simulation, or the LLM-as-judge. Actual prompts are not provided. These omissions make exact reproduction impossible even with the released code."
    352     },
    353     {
    354       "flag": "Potential confound between architecture and task specialization",
    355       "detail": "The Architecture Suite finding that CRAG outperforms Plan-and-Execute and LATS may be confounded: CRAG is explicitly designed for retrieval-centric workloads and the evaluation uses HotpotQA (a retrieval-centric QA task). The paper acknowledges this (CRAG is 'optimized for retrieval-centric workloads') but still frames findings broadly about 'specialized vs. general architectures'."
    356     }
    357   ],
    358   "cited_papers": [
    359     {
    360       "title": "AgentBench: evaluating LLMs as agents",
    361       "authors": ["Xiao Liu", "Hao Yu", "Hanchen Zhang"],
    362       "year": 2023,
    363       "arxiv_id": "2308.03688",
    364       "relevance": "Major benchmark for evaluating LLM agent capabilities, directly relevant to evaluating agentic AI systems."
    365     },
    366     {
    367       "title": "Why do multi-agent LLM systems fail?",
    368       "authors": ["Mert Cemri", "Melissa Z Pan", "Shuyi Yang"],
    369       "year": 2025,
    370       "arxiv_id": "2503.13657",
    371       "relevance": "Taxonomy of failure modes in multi-agent LLM systems, directly relevant to reliability evaluation of agentic AI."
    372     },
    373     {
    374       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations",
    375       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    376       "year": 2024,
    377       "relevance": "Foundational multi-agent framework widely used in agentic AI research."
    378     },
    379     {
    380       "title": "Magentic-One: A generalist multi-agent system for solving complex tasks",
    381       "authors": ["Adam Fourney", "Gagan Bansal", "Hussein Mozannar"],
    382       "year": 2024,
    383       "relevance": "Generalist multi-agent system from Microsoft, evaluated as a MAS instance in MAESTRO."
    384     },
    385     {
    386       "title": "Measuring agents in production",
    387       "authors": ["Melissa Z Pan", "Negar Arabzadeh", "Riccardo Cogo"],
    388       "year": 2025,
    389       "arxiv_id": "2512.04123",
    390       "relevance": "Survey of production MAS evaluation practices, reports 75% of teams evaluate without benchmarks."
    391     },
    392     {
    393       "title": "Beyond black-box benchmarking: observability, analytics, and optimization of agentic systems",
    394       "authors": ["Dany Moshkovich", "Hadar Mulian", "Sergey Zeltyn"],
    395       "year": 2025,
    396       "arxiv_id": "2503.06745",
    397       "relevance": "Observability-based benchmarking for agentic systems, directly related to MAESTRO's approach."
    398     },
    399     {
    400       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    401       "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath"],
    402       "year": 2023,
    403       "relevance": "Foundational work on agent self-reflection strategies, relevant to agentic AI design patterns."
    404     },
    405     {
    406       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    407       "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"],
    408       "year": 2023,
    409       "relevance": "Tree-of-Thoughts reasoning strategy evaluated as a MAS instance in MAESTRO."
    410     },
    411     {
    412       "title": "Language agent tree search unifies reasoning acting and planning in language models",
    413       "authors": ["Andy Zhou", "Kai Yan", "Michal Shlapentokh-Rothman"],
    414       "year": 2023,
    415       "arxiv_id": "2310.04406",
    416       "relevance": "LATS architecture evaluated as a key MAS instance in the Architecture Suite of MAESTRO."
    417     },
    418     {
    419       "title": "Lost in the middle: How language models use long contexts",
    420       "authors": ["Nelson F Liu", "Kevin Lin", "John Hewitt"],
    421       "year": 2024,
    422       "relevance": "Cited to explain why longer interaction histories degrade MAS accuracy, relevant to understanding LLM limitations."
    423     },
    424     {
    425       "title": "Beyond self-talk: a communication-centric survey of LLM-based multi-agent systems",
    426       "authors": ["Bingyu Yan", "Zhibo Zhou", "Litian Zhang"],
    427       "year": 2025,
    428       "arxiv_id": "2502.14321",
    429       "relevance": "Survey of communication strategies in multi-agent LLM systems."
    430     },
    431     {
    432       "title": "Large language model based multi-agents: a survey of progress and challenges",
    433       "authors": ["Taicheng Guo", "Xiuying Chen", "Yaqi Wang"],
    434       "year": 2024,
    435       "arxiv_id": "2402.01680",
    436       "relevance": "Comprehensive survey of LLM-based multi-agent systems covering progress and challenges."
    437     },
    438     {
    439       "title": "TRAIL: trace reasoning and agentic issue localization",
    440       "authors": ["Darshan Deshpande", "Varun Gangal", "Hersh Mehta"],
    441       "year": 2025,
    442       "arxiv_id": "2505.08638",
    443       "relevance": "Trace-based reasoning and issue localization for agentic systems, directly relevant to MAS observability."
    444     },
    445     {
    446       "title": "AgentArch: a comprehensive benchmark to evaluate agent architectures in enterprise",
    447       "authors": ["Tara Bogavelli", "Roshnee Sharma", "Hari Subramani"],
    448       "year": 2025,
    449       "arxiv_id": "2509.10769",
    450       "relevance": "Benchmark for evaluating agent architectures in enterprise settings."
    451     }
    452   ]
    453 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs