ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28455B)


      1 {
      2   "paper": {
      3     "title": "The Six Sigma Agent: Achieving Enterprise-Grade Reliability in LLM Systems Through Consensus-Driven Decomposed Execution",
      4     "authors": ["Khush Patel", "Siva Surendira", "Jithin George", "Shreyas Kapale"],
      5     "year": 2026,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2601.22290",
      8     "doi": "10.48550/arXiv.2601.22290"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["theoretical", "case-study", "benchmark-eval"],
     13   "key_findings": "The paper proposes the Six Sigma Agent architecture combining task decomposition, parallel multi-LLM execution, and consensus voting to achieve enterprise-grade reliability. The theoretical analysis shows that consensus among n agents with error rate p yields system error O(p^⌈n/2⌉). The authors claim 3.4 DPMO (Six Sigma standard) with 13 agents and 80% cost reduction, but the 'evaluation' results appear to derive from theoretical binomial calculations using assumed error rates rather than empirical measurement.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The Reproducibility Statement says 'Enterprise workflow templates and implementation code will be released upon publication.' This is a promise of future release, not a current release."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No dataset is released. The enterprise use case data is not provided, and no benchmark datasets are shared."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, dependency lists, or hardware details are provided anywhere in the paper."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The Reproducibility Statement references Appendix D for calculations but no step-by-step reproduction instructions are provided. The paper states code will be released 'upon publication.'"
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No confidence intervals or error bars are reported on any experimental results. Table 1 reports single point estimates for DPMO and error rates."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Appendix D states 'All pairwise comparisons between Six Sigma Agent configurations and baselines are statistically significant (p < 0.001)' but provides no details on which test was used, sample sizes, or how significance was computed. The claim appears to refer to theoretical calculations, not empirical tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports relative improvements with baseline context: '14,700× reliability improvement over single GPT-4o-mini' (from 50,000 DPMO to 3.4 DPMO), '80% cost reduction,' and '45× improvement' from consensus voting. These provide magnitude context."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The number of tasks or examples tested in each enterprise use case is never stated. Section 6 reports aggregate DPMO numbers without specifying how many actions were actually executed and evaluated."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures are reported for any experimental results. All results are single point estimates."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Table 1 includes baselines: Single Agent (GPT-4o), Single Agent (GPT-4o-mini), and CoT-SC (n=5, no decomposition)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The baselines are minimal: single-agent execution and basic self-consistency. No comparison to existing multi-agent reliability frameworks, ensemble methods like ICE (which the paper discusses in related work), or other fault-tolerance approaches."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 2 presents an ablation study showing the contribution of each component: atomic decomposition, consensus voting (n=5), and dynamic scaling (n=13)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper reports multiple metrics: end-to-end accuracy, action-level accuracy, DPMO, cost, latency overhead, and scaling rate."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation of system outputs is reported. All evaluation appears automated or theoretical. For enterprise use cases involving financial reconciliation and contract analysis, human evaluation of correctness would be important."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No mention of held-out test sets. The enterprise use cases are described as case studies without clear train/test separation."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 6.3 provides per-use-case breakdown (FinProcess 8% scaling rate, CustSupport 12%, DocAnalysis 14%) and Section 6.5 provides scaling rate analysis per domain."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 6.6 discusses error analysis, noting consensus voting fails when tasks are ambiguous or require specialized domain knowledge. Case studies in 6.7 show contested votes and how they were resolved."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "Every configuration shows improvement. No experiments that failed, approaches that didn't work, or scenarios where the architecture performed poorly are reported. Section 7.2 discusses theoretical limitations but no empirical negative results."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The abstract claims '14,700× reliability improvement' and '3.4 DPMO' from 'Evaluation across three enterprise use cases,' but the evaluation results (Table 1) appear to be theoretical binomial calculations using assumed error rates (5% for GPT-4o-mini, 1% for GPT-4o) rather than empirically measured outcomes. The DPMO values match the theoretical formula exactly."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes strong causal claims ('consensus voting reduces error from 5% to 0.11%', 'dynamic scaling achieves 3.4 DPMO') but the evidence comes from theoretical calculations with assumed parameters, not controlled experiments. The ablation (Table 2) uses the same theoretical framework rather than empirical measurement."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims 'Enterprise-Grade Reliability' and the abstract claims this 'establishes that reliability in AI systems emerges from principled redundancy.' These are sweeping claims based on three illustrative case studies. The paper does not bound its claims to the specific tasks, models, or domains tested."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for its results. For example, the assumed 5% error rate may not hold for different tasks, error correlation may be higher than assumed, or the independence assumption (Assumption 1) may systematically fail in practice. Section 5.2 provides theoretical analysis of correlation but no empirical validation."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper measures DPMO (defects per million opportunities) but conflates this with 'enterprise-grade reliability.' Real enterprise reliability involves uptime, latency SLAs, data consistency, and more than just action-level correctness. The paper does not discuss the gap between its proxy (action-level error rate) and the broader claim (enterprise reliability)."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper mentions 'GPT-4o-mini, Claude Haiku, and Gemini Flash' (Section 6.1.1) without specifying exact versions or snapshot dates for any model."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The decomposition prompt (Appendix B.1) and the voting judge selection prompt (Section 4.4.4) are both provided as actual prompt text."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Temperature T=0.7 is specified (Section 4.3.2), similarity threshold τ=0.85 and confidence threshold θ=0.6 (Section 4.4.1), default n=5, dynamic scaling ∆n=4, nmax=13 are all stated."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The architecture is described in detail in Section 4: task decomposition (4.2), micro-agent sampling (4.3), voting judge (4.4), world state manager (4.5), with algorithms (Algorithm 1, 2), figures, and implementation details in Appendix B."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "No description of how the enterprise use case data was prepared, what documents were used, or how task correctness was determined. The case studies (Section 6.7) describe individual examples but not the overall data pipeline."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7.2 'Limitations' discusses decomposition quality, systematic errors, open-ended tasks, and scalability."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "Section 7.2 limitations are relatively generic. It mentions 'Highly integrated tasks resisting atomic breakdown may not benefit' and 'Assumption 3 may be violated' but does not discuss specific threats like whether the assumed 5% error rate was validated, whether the independence assumption holds empirically, or whether the case studies are representative."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 7.1 explicitly lists where the approach is 'Less suitable': creative tasks, hard real-time systems, simple single-step queries, and subjective tasks. This provides clear scope boundaries."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data is available. The execution traces, agent outputs, and enterprise documents are not released."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The enterprise use cases are described at a high level (Section 6.1.1, Appendix C) but no specifics on how many tasks were executed, how ground truth was determined, or what documents were processed."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. The study uses LLM-based agents on enterprise tasks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The system architecture is documented but the data pipeline from raw enterprise documents to final DPMO calculations is not. It is unclear how many total actions were executed and how correctness was determined for each."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source is disclosed. All authors are affiliated with 'Lyzr Research' which appears to be a commercial entity, but no funding statement is provided."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are listed with 'Lyzr Research' affiliation and @lyzr.ai email addresses."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Lyzr appears to be an AI agent platform company. The authors are evaluating an architecture that would be part of their commercial product. The funder (Lyzr) has a direct financial interest in demonstrating agent reliability."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is provided. Authors from a commercial AI agent company evaluating their own reliability architecture should disclose financial interests."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper evaluates a system architecture's reliability on enterprise tasks, not a pre-trained model's knowledge on benchmarks. Contamination in the traditional sense does not apply."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Not applicable for the same reason as training_cutoff_stated: the paper tests a reliability architecture, not model knowledge on benchmarks."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No standard benchmarks are used. The evaluation is on proprietary enterprise tasks."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "The paper claims '80% cost reduction' by using cheaper models (Section 6.2) and discusses latency overhead of 47% (Section 4.3.5). Section 6.5 notes dynamic scaling adds ~18% cost over baseline n=5."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget is stated. The '80% cost reduction' is presented as a relative claim without absolute numbers (e.g., total API spend, tokens consumed)."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No seed sensitivity analysis is reported. The paper discusses temperature variation (T=0.7) for diversity but does not report results across different random seeds."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is never stated. It is unclear how many total workflows were executed to produce the DPMO numbers in Table 1."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. The choice of T=0.7, τ=0.85, θ=0.6 appears untuned or arbitrarily chosen, with no search described."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper presents results for specific configurations (n=5, n=9, n=13) without explaining how these were selected or whether other configurations were tried."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No formal statistical tests are performed that would require multiple comparison correction."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors evaluate their own architecture against baselines they implemented. No acknowledgment of self-comparison bias per Lucic et al. (2018)."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The Six Sigma Agent uses 5-13× more API calls than single-agent baselines. While latency (47%) and cost ('80% reduction') are mentioned, there is no systematic performance-vs-compute analysis showing the Pareto frontier."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper uses three proprietary enterprise use cases without discussing whether they are representative of enterprise workloads or whether DPMO is the right metric for LLM reliability. No comparison with alternative benchmarks."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The architecture IS the scaffold, yet baselines use a different scaffold (single-agent). The improvement could be due to the decomposition, the consensus mechanism, or the multi-model diversity, but the confound between scaffold and model is not systematically addressed beyond the ablation."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The enterprise use cases could involve documents or patterns seen during model training. No discussion of temporal leakage."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the task setup leaks information (e.g., whether the decomposition provides hints that wouldn't be available in real deployment)."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the enterprise test cases are independent or share structural similarities that could inflate reliability estimates."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Six Sigma Agent achieves 3.4 DPMO (Six Sigma standard) through dynamic scaling to 13 agents",
    365       "evidence": "Table 1, Section 6.2. The DPMO value matches the theoretical binomial calculation in Appendix D for n=13, p=0.05: Psys = 0.0000034.",
    366       "supported": "weak"
    367     },
    368     {
    369       "claim": "14,700× reliability improvement over single GPT-4o-mini execution",
    370       "evidence": "Section 6.2, Table 1. Calculated as 50,000 DPMO / 3.4 DPMO. Both values appear to be theoretical (50,000 = 5% × 10^6; 3.4 from binomial formula) rather than empirically measured.",
    371       "supported": "weak"
    372     },
    373     {
    374       "claim": "80% cost reduction compared to using expensive reasoning models",
    375       "evidence": "Section 6.2. Claimed based on using GPT-4o-mini (cheaper) with 5-way redundancy vs single GPT-4o. No absolute cost figures provided.",
    376       "supported": "weak"
    377     },
    378     {
    379       "claim": "Consensus voting with 5 agents reduces error from 5% to 0.11%",
    380       "evidence": "Section 6.2, Equation 16. This is a direct theoretical calculation: Psys(5, 0.05) = 0.00116. The 5% base error rate is assumed, not measured.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Sampling n independent outputs with error rate p achieves system error O(p^⌈n/2⌉)",
    385       "evidence": "Theorem 1 and Corollary 2, Section 5.1. Formal proof provided under stated assumptions (independence, bounded error, error diversity).",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Error correlation tolerance is robust: for n=11, p=0.05, ρmax ≈ 0.99",
    390       "evidence": "Corollary 5, Section 5.2. Mathematical derivation provided. However, this assumes a simple binary correlation model that may not capture real-world correlation structure.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "Theoretical results presented as empirical evaluation",
    397       "detail": "The 'Experiments' section (Section 6) presents DPMO values that exactly match theoretical binomial calculations (Appendix D). The assumed error rates (5% for GPT-4o-mini and 1% for GPT-4o) are not empirically validated. The '14,700× improvement' derives from dividing two theoretical numbers. The paper frames this as 'Evaluation across three enterprise use cases' (abstract) when it appears to be theoretical analysis with illustrative case studies."
    398     },
    399     {
    400       "flag": "Company evaluating its own product",
    401       "detail": "All four authors are from Lyzr Research, an AI agent platform company. They are evaluating an architecture that appears to be their commercial product. No conflict of interest statement, no independent evaluation, and no financial interest disclosure."
    402     },
    403     {
    404       "flag": "Independence assumption likely violated",
    405       "detail": "Assumption 1 (error independence) is critical to all theoretical results but is likely violated in practice. LLMs from different families share training data (Common Crawl, web text), similar RLHF objectives, and may have correlated failure modes on domain-specific tasks. The paper claims ρ ≈ 0.08 for different model families (Section 4.3.4) but provides no empirical evidence for this number."
    406     },
    407     {
    408       "flag": "No sample sizes reported",
    409       "detail": "The number of tasks, workflows, or actions actually executed and evaluated is never stated. For claims of '3.4 DPMO' to be empirically validated, millions of opportunities would need to be observed. The case studies show only 3 individual workflow examples."
    410     },
    411     {
    412       "flag": "Selective baselines",
    413       "detail": "The paper discusses ICE (27% accuracy improvement), Reflexion (91% pass@1), and other sophisticated approaches in related work but does not compare against any of them. Baselines are limited to single-agent execution and basic self-consistency."
    414     },
    415     {
    416       "flag": "Assumed base error rates",
    417       "detail": "The entire evaluation rests on assumed per-action error rates (5% for GPT-4o-mini, 1% for GPT-4o). These are not measured on the actual enterprise tasks. Real error rates could be much higher for complex enterprise tasks, which would invalidate the reliability claims."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "Why do multi-agent LLM systems fail?",
    423       "authors": ["Mert Cemri", "Melissa Z. Pan", "Shuyi Yang"],
    424       "year": 2025,
    425       "arxiv_id": "2503.13657",
    426       "relevance": "Comprehensive taxonomy of 14 failure modes in multi-agent LLM systems based on 1,600+ annotated execution traces across 7 frameworks."
    427     },
    428     {
    429       "title": "Self-consistency improves chain of thought reasoning in language models",
    430       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
    431       "year": 2023,
    432       "relevance": "Foundational work on consensus-based reasoning through sampling multiple reasoning paths; a direct inspiration for the Six Sigma approach."
    433     },
    434     {
    435       "title": "Refining LLMs outputs with iterative consensus ensemble (ICE)",
    436       "authors": ["Mahmud Omar", "Benjamin S. Glicksberg", "Girish N. Nadkarni", "Eyal Klang"],
    437       "year": 2025,
    438       "relevance": "Multi-LLM consensus framework achieving 27% accuracy improvement through iterative critique, closely related approach to consensus-based reliability."
    439     },
    440     {
    441       "title": "Detecting hallucinations in large language models using semantic entropy",
    442       "authors": ["Sebastian Farquhar", "Jannik Kossen", "Lorenz Kuhn", "Yarin Gal"],
    443       "year": 2024,
    444       "relevance": "Published in Nature; semantic entropy for hallucination detection, relevant to uncertainty quantification in LLM outputs."
    445     },
    446     {
    447       "title": "How do LLMs fail in agentic scenarios?",
    448       "authors": ["Melissa Z. Pan"],
    449       "year": 2025,
    450       "arxiv_id": "2512.07497",
    451       "relevance": "Identifies four failure archetypes in LLM agents (premature action, over-helpfulness, context pollution, fragile execution)."
    452     },
    453     {
    454       "title": "Reflexion: Language agents with verbal reinforcement learning",
    455       "authors": ["Noah Shinn", "Federico Cassano"],
    456       "year": 2023,
    457       "relevance": "Self-improvement through reflection achieving 91% pass@1 on HumanEval, alternative approach to LLM reliability improvement."
    458     },
    459     {
    460       "title": "Training language models to self-correct via reinforcement learning",
    461       "authors": ["Aviral Kumar", "Vincent Zhuang", "Rishabh Agarwal"],
    462       "year": 2025,
    463       "relevance": "SCoRe framework for self-correction via multi-turn RL, achieving 15.6% gains on MATH, alternative reliability improvement approach."
    464     },
    465     {
    466       "title": "TDAG: A multi-agent framework based on dynamic task decomposition and agent generation",
    467       "authors": ["Yaoxiang Wang", "Zhiyong Wu", "Junfeng Yao", "Jinsong Su"],
    468       "year": 2025,
    469       "relevance": "Dynamic task decomposition with agent generation addressing error propagation in fixed decomposition schemes."
    470     },
    471     {
    472       "title": "MetaGPT: Meta programming for a multi-agent collaborative framework",
    473       "authors": ["Sirui Hong", "Mingchen Zhuge"],
    474       "year": 2024,
    475       "relevance": "Multi-agent framework encoding SOPs into agent coordination, relevant to multi-agent system design."
    476     },
    477     {
    478       "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversation",
    479       "authors": ["Qingyun Wu", "Gagan Bansal"],
    480       "year": 2023,
    481       "arxiv_id": "2308.08155",
    482       "relevance": "Microsoft's multi-agent conversation framework, one of the key agent frameworks evaluated in failure studies."
    483     },
    484     {
    485       "title": "SeSE: A structural information-guided uncertainty quantification framework for hallucination detection in LLMs",
    486       "authors": ["Xingtao Zhao", "Hao Peng"],
    487       "year": 2025,
    488       "arxiv_id": "2511.16275",
    489       "relevance": "Reports 28% hallucination rates for frontier models, motivating reliability engineering approaches."
    490     },
    491     {
    492       "title": "Pre-Act: Multi-step planning and reasoning improves acting in LLM agents",
    493       "authors": ["Mrinal Rawat", "Ambuje Gupta"],
    494       "year": 2025,
    495       "arxiv_id": "2505.09970",
    496       "relevance": "Planning-before-acting framework achieving +70% Action Recall over ReAct, alternative approach to agent reliability."
    497     }
    498   ]
    499 }

Impressum · Datenschutz