ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28849B)


      1 {
      2   "paper": {
      3     "title": "ART: Adaptive Response Tuning Framework — A Multi-Agent Tournament-Based Approach to LLM Response Optimization",
      4     "authors": ["Omer Jauhar Khan"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2512.00617"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No GitHub link or code repository URL is provided in the paper. The paper mentions a production deployment at aetherapi.co (Section VI.E and VII.C) but this is a SaaS platform, not released source code. No Zenodo archive or downloadable code is provided."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No dataset is released or referenced as publicly available. The experiments use 'diverse query categories' (Section V.A.2) but the actual queries are not provided — only example categories are listed (factual questions, reasoning problems, creative writing, technical explanations, multi-step tasks)."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile specifications, or dependency versions are provided. Section IV.G mentions 'Docker containerization for deployment' but does not provide the actual environment specification or library versions needed to reproduce."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are included. Appendix D provides a code example for a basic tournament but this is a usage example, not instructions for reproducing the paper's experimental results. The actual experimental queries, configurations, and evaluation procedures are not provided."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Tables I, II, and IV report only point estimates. Standard deviation is reported only for the consensus strategy comparison (Table III: Std Dev 6.3, 5.1, 4.2, 5.8), and no confidence intervals are provided for the main quality improvement results. The 8.4% improvement claim in Table II has no uncertainty measure."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No statistical significance tests are performed. The paper claims 'significant improvements' (abstract) and that consensus strategies outperform baselines, but these comparisons are made by simply comparing numbers without any statistical test (t-test, bootstrap, etc.)."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Percentage improvements are reported with baseline context in Table II: accuracy +9.5% (from 72.3 to 79.2), coherence +7.3% (from 75.1 to 80.6), completeness +12.9% (from 68.9 to 77.8), relevance +4.1% (from 80.2 to 83.5), overall +8.4% (from 74.1 to 80.3). These provide enough context for the reader to assess magnitude."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No sample size justification is provided. The number of queries tested, number of tournament runs, or number of repetitions is never stated. The paper does not specify how many queries were used in each category or total."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Standard deviation is reported only for consensus strategy comparison (Table III). No variance or standard deviation is reported for the main quality improvement results across tournament rounds (Table II). It is unclear whether results are from a single run or averaged over multiple runs, and no spread measures are given for the primary claims."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Round 1 scores serve as the single-model baseline, and the paper compares consensus strategies against each other (Table III). Section V.C.3 states 'All consensus strategies outperformed random agent selection.' However, the baseline is the initial round of the same mock agents, not independent single-model performance."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper does not compare against any contemporary multi-agent response optimization systems. Section VI.D provides qualitative comparisons against Self-Consistency, Debate, Cascades, and Mixture of Experts but no quantitative baseline from any of these approaches is included. The only quantitative baseline is Round 1 scores of the mock agents."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study is presented. The framework has multiple components (cross-evaluation, ELO ranking, consensus strategies, response improvement) but there is no systematic evaluation of removing individual components to measure their contribution. The consensus strategy comparison (Table III) is the closest but does not isolate individual components."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics are used: accuracy, coherence, completeness, and relevance as quality dimensions (Table II), plus system performance metrics (Table IV: latency, throughput) and ELO convergence (Table I, R^2 value)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation is included. All quality scores come from the automated cross-evaluation system where mock agents score each other. Section II.B acknowledges 'Human evaluation: Gold standard but expensive and time-consuming' but does not include any. Given that the paper claims to produce 'measurably superior outputs,' human evaluation of response quality would be highly relevant."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No held-out test set is described. The paper does not specify any separation of development and test queries. The query dataset is described only by category (Section V.A.2) with no mention of train/test splitting or held-out evaluation."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No per-category breakdown is provided despite the paper listing five query categories (factual, reasoning, creative, technical, multi-step). All results in Tables I-IV are aggregate across all categories, hiding potential variation in framework effectiveness across query types."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section VI.B (Limitations) discusses failure modes: cross-evaluation perpetuating shared biases (point 2), draw threshold sensitivity (point 3), and cold start problems (point 4). Section VI.C discusses bias amplification. These qualify as discussion of where the approach may break down."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No negative results are reported. Every metric shows improvement across every round (Table II shows monotonic improvement), every consensus strategy outperforms random selection, and ELO convergence is universally successful. No experiments that failed or configurations that underperformed are reported."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "The abstract claims 'significant improvements in response accuracy, coherence, and reliability compared to baseline single-model approaches' and an '8.4% improvement in overall quality metrics.' However, the experiments use mock agents with predetermined quality levels (0.85, 0.75, 0.65) rather than actual LLMs. The 'baseline single-model approaches' are simulations, not real single-model outputs. The abstract also claims 'production-ready solution' but provides no evidence of real production usage or real LLM evaluation."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims that the tournament-based approach 'produces consensus responses that outperform individual model outputs' (abstract) and that 'cross-evaluation helps identify missing content' (Section V.C). These are causal claims but the study design — using mock agents with fixed quality distributions rather than actual LLMs — does not adequately support them. The mock agents generate responses according to preset quality levels, so the improvement may be an artifact of the simulation rather than a property of the tournament approach."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper's title and abstract claim broad applicability ('LLM Response Optimization,' 'production-ready solution for applications requiring high-quality, vetted LLM responses') but experiments use only 3 mock agents with simulated quality levels. No actual LLMs are tested. The gap between the general claims and the mock-agent-only evaluation is substantial."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for its results. The quality improvements could be an artifact of the mock agent simulation design (agents with fixed quality levels generating responses from predetermined distributions). The paper does not consider that the observed improvements might not replicate with real LLMs, or that the scoring mechanism may be tautological (agents scoring each other using the same quality metrics used to define the mock agents)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "No actual model versions are specified. Section V.A.1 describes 3 mock agents 'simulating different model capabilities' with assigned quality levels (0.85, 0.75, 0.65), described as 'representing GPT-4 class,' 'representing GPT-3.5 class,' and 'representing smaller models.' No actual LLM APIs were used. The code example in Appendix D references 'gpt-4' and 'claude-2', but these are mock agent labels, not actual model deployments."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "The experiments use mock agents with predetermined quality levels, not actual LLM prompting. No prompts were sent to real models, so this criterion does not apply to the experimental setup as executed."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Tournament configuration parameters are reported: K-factor = 32.0, 3 rounds per tournament, draw threshold = 5.0, scoring weights (accuracy 0.35, coherence 0.25, completeness 0.25, relevance 0.15). These are detailed in the API specification (Appendix A), Table V, and Section V.A.1. Mock agent quality levels (0.85, 0.75, 0.65) are also specified."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The agentic scaffolding is described in detail. Section IV describes the full architecture: agent interface (generate_response, critique_response, improve_response), tournament engine workflow (7 phases detailed in Section IV.C.2), consensus engine strategies (Section IV.E), and agent state management (Section IV.B.2). Figures 4-8 provide detailed architectural diagrams."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No data preprocessing documentation is provided. The query dataset is described only at the category level (Section V.A.2) with example queries but no description of how queries were collected, selected, or prepared for experiments."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section VI.B 'Limitations' provides a dedicated discussion of four limitations: computational cost, evaluation quality (shared biases), draw threshold sensitivity, and cold start problem. Section VI.C adds ethical considerations including bias amplification."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section VI.B lists specific threats: 'Cross-evaluation by LLMs may perpetuate shared biases or fail to identify subtle errors when all agents make similar mistakes' (point 2), 'The draw threshold parameter affects ELO dynamics significantly, requiring careful tuning for different domains' (point 3), and 'New agents require 5-10 tournaments to establish accurate ratings' (point 4). These are specific to this system."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. It does not acknowledge that the mock agent experiments cannot demonstrate effectiveness with real LLMs, does not bound claims to the simulated setting, and does not identify specific scenarios or domains where ART would not apply. The limitations discuss system-level concerns but do not bound the generalization of the experimental results."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data is available. The tournament results, query datasets, agent responses, and scoring records are not released for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "Section V.A.2 lists query categories (factual, reasoning, creative, technical, multi-step) with examples, but does not describe how many queries were used, how they were created or selected, or what specific queries were tested. The data collection is insufficiently described."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants were involved. The study uses mock agents and automated evaluation only."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The processing pipeline from query to response is described architecturally (Section IV, Figure 8), but the experimental data pipeline — how queries were fed into the system, how many experiments were run, how results were aggregated into the summary tables — is not documented."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding information is provided anywhere in the paper. There is no acknowledgments section listing grants or sponsors."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The author's affiliation is listed: Department of Computer Science, National University of Computer and Emerging Sciences (FAST-NUCES), Peshawar, Pakistan."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The paper promotes aetherapi.co as a production deployment of ART (Sections VI.E and VII.C, with a footnote URL). The author appears to have a commercial interest in the framework's success, but this potential conflict is not disclosed or discussed. The absence of any funding or conflict disclosure while promoting a commercial product is concerning."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present. The paper promotes aetherapi.co as a SaaS platform implementing ART, suggesting potential commercial interest, but no financial interest disclosure is made."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The experiments use mock agents with predetermined quality levels, not actual pre-trained models evaluated on benchmarks. No real LLM capability is being tested, so training cutoff dates are not relevant."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Mock agents are used rather than pre-trained models evaluated on benchmarks. Train/test overlap is not applicable in this experimental design."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No benchmark evaluation of pre-trained models is conducted. Mock agents with fixed quality distributions are used, so benchmark contamination is not applicable."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Table IV reports system performance metrics: average response generation (312 ms), average cross-evaluation (524 ms), average round duration (1.8 s), average tournament duration (5.4 s). Section VI.B.1 notes 'Running multiple agents increases latency (5.4s average per tournament) and API costs proportional to agent count.' While these are latency metrics rather than monetary costs, they are reported. However, since mock agents were used, these reflect simulation costs, not real LLM API costs."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget is stated. While Table IV gives per-query timing, there is no information about total compute used for the experiments, hardware specifications, or total API spend. Because mock agents were used, real API costs were presumably zero, but the paper does not state this."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "ART produces consensus responses that outperform individual model outputs with an 8.4% improvement in overall quality metrics.",
    286       "evidence": "Table II shows overall quality scores improving from 74.1 (Round 1) to 80.3 (Round 3), yielding +8.4% improvement. However, this is measured on mock agents with predetermined quality levels (0.85, 0.75, 0.65), not actual LLMs.",
    287       "supported": "weak"
    288     },
    289     {
    290       "claim": "ELO ratings demonstrate stable convergence with R² values exceeding 0.96.",
    291       "evidence": "Table I shows agent ratings converging from 1500 to 1612 (Alpha), 1502 (Beta), and 1386 (Gamma) after 10 tournaments. Section V.B.1 states R² > 0.96. However, with predetermined quality levels and simulated responses, convergence is expected by design.",
    292       "supported": "weak"
    293     },
    294     {
    295       "claim": "Hybrid synthesis achieves highest average quality (83.4) among consensus strategies.",
    296       "evidence": "Table III reports consensus strategy comparison: Hybrid Synthesis (83.4 avg quality, 5.8 std dev, 88% best rate) vs Weighted Voting (82.8, 5.1, 85%), Top Response (81.2, 6.3, 78%), Contextual Aggregation (80.5, 4.2, 72%). The comparison is among strategies but uses mock agents.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "The framework maintains sub-second round times and supports high concurrency (50+ concurrent tournaments, 100+ req/s).",
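    301       "evidence": "Table IV reports performance metrics (average response generation 312 ms, average cross-evaluation 524 ms, average round duration 1.8 s, average tournament duration 5.4 s). The reported 1.8 s average round duration appears inconsistent with the 'sub-second round times' wording. In addition, since mock agents are used rather than actual LLM API calls, these latency numbers do not reflect real-world performance with actual LLM backends.",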
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "Cross-evaluation helps identify missing content, as evidenced by the largest gains in completeness (+12.9%).",
    306       "evidence": "Table II shows completeness improving from 68.9 to 77.8 (+12.9%). Section V.C states 'The largest gains in completeness (+12.9%) suggest that cross-evaluation helps identify missing content.' This is a causal interpretation of a mock-agent simulation without evidence that cross-evaluation is the mechanism.",
    307       "supported": "unsupported"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "The ART framework uses tournament-style ELO ranking and multi-agent reasoning to optimize LLM responses, claiming an 8.4% quality improvement and R² > 0.96 for ELO convergence. However, all experiments use 3 mock agents with predetermined quality levels (0.85, 0.75, 0.65) rather than actual LLMs, making it impossible to determine whether the framework would improve real LLM outputs. The framework architecture is described in detail with API specifications and consensus strategies, and a commercial SaaS deployment at aetherapi.co is promoted.",
    312   "red_flags": [
    313     {
    314       "flag": "Mock agents instead of real LLMs",
    315       "detail": "The entire experimental evaluation (Section V.A.1) uses 3 mock agents with predetermined quality levels (0.85, 0.75, 0.65) described as 'simulating different model capabilities.' No actual LLM APIs (GPT-4, Claude, etc.) are used in any experiment. The paper's central claim about optimizing LLM outputs is untested on actual LLMs, making the empirical results largely uninformative about real-world effectiveness."
    316     },
    317     {
    318       "flag": "Potentially tautological evaluation",
    319       "detail": "Mock agents with fixed quality distributions are evaluated using the framework's own quality scoring system. The agents' responses are scored on the same dimensions (accuracy, coherence, completeness, relevance) that define their predetermined quality levels. This creates a potential circularity where the evaluation confirms the simulation's built-in assumptions rather than measuring genuine improvement."
    320     },
    321     {
    322       "flag": "Undisclosed commercial interest",
    323       "detail": "The paper promotes aetherapi.co as a production SaaS platform implementing ART (Sections VI.E and VII.C) but includes no competing interests statement or disclosure of the author's commercial relationship with this platform. This is a potential conflict of interest that could bias the presentation of results."
    324     },
    325     {
    326       "flag": "Claims significantly outrun evidence",
    327       "detail": "The abstract claims 'significant improvements in response accuracy, coherence, and reliability compared to baseline single-model approaches' and a 'production-ready solution,' but the evidence comes entirely from simulated mock agents. No statistical significance tests are performed, and 'significant' is used colloquially. The gap between the mock-agent evidence and the broad LLM optimization claims is substantial."
    328     },
    329     {
    330       "flag": "Unspecified sample size",
    331       "detail": "The number of queries used in the experiments is never stated. The paper lists query categories (Section V.A.2) but does not specify how many queries were tested per category or in total, making it impossible to assess the reliability of the reported results."
    332     },
    333     {
    334       "flag": "No negative results",
    335       "detail": "Every metric shows monotonic improvement across all rounds (Table II), every consensus strategy outperforms random selection (Table III), and ELO convergence is universally smooth (Table I). The complete absence of any negative result or failed configuration is suspicious for a complex multi-agent system."
    336     }
    337   ],
    338   "cited_papers": [
    339     {
    340       "title": "Language models are few-shot learners",
    341       "authors": ["T. Brown", "B. Mann", "N. Ryder", "M. Subbiah"],
    342       "year": 2020,
    343       "relevance": "Foundational GPT-3 paper establishing few-shot capability of large language models, relevant to the LLM capability assessment scope of the survey."
    344     },
    345     {
    346       "title": "Training language models to follow instructions with human feedback",
    347       "authors": ["L. Ouyang", "J. Wu", "X. Jiang", "D. Almeida"],
    348       "year": 2022,
    349       "relevance": "InstructGPT paper on RLHF, relevant to LLM alignment and capability evaluation methodology."
    350     },
    351     {
    352       "title": "Self-consistency improves chain of thought reasoning in language models",
    353       "authors": ["X. Wang", "J. Wei", "D. Schuurmans", "Q. Le", "E. Chi"],
    354       "year": 2023,
    355       "relevance": "Self-consistency voting baseline for multi-agent response aggregation, directly relevant to multi-agent LLM evaluation methodology."
    356     },
    357     {
    358       "title": "Improving factuality and reasoning in language models through multiagent debate",
    359       "authors": ["Y. Du", "S. Li", "A. Torralba", "J. B. Tenenbaum", "I. Mordatch"],
    360       "year": 2023,
    361       "arxiv_id": "2305.14325",
    362       "relevance": "Multi-agent debate framework for improving LLM outputs, directly comparable approach to ART's tournament-based optimization."
    363     },
    364     {
    365       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    366       "authors": ["S. Yao", "D. Yu", "J. Zhao", "I. Shafran", "T. L. Griffiths"],
    367       "year": 2023,
    368       "relevance": "Multi-path reasoning framework for LLMs, relevant to the survey's coverage of LLM capability enhancement techniques."
    369     },
    370     {
    371       "title": "Constitutional ai: Harmlessness from ai feedback",
    372       "authors": ["Y. Bai", "S. Kadavath", "S. Kundu", "A. Askell"],
    373       "year": 2022,
    374       "arxiv_id": "2212.08073",
    375       "relevance": "Anthropic's Constitutional AI approach using self-critique, relevant to AI safety and multi-agent self-evaluation methodology."
    376     },
    377     {
    378       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    379       "authors": ["L. Chen", "M. Zaharia", "J. Zou"],
    380       "year": 2023,
    381       "arxiv_id": "2305.05176",
    382       "relevance": "LLM cascade approach for cost-effective model routing, directly relevant to multi-model optimization and the cost-practicality dimension of LLM evaluation."
    383     },
    384     {
    385       "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena",
    386       "authors": ["L. Zheng", "W.-L. Chiang", "Y. Sheng", "S. Zhuang"],
    387       "year": 2023,
    388       "arxiv_id": "2306.05685",
    389       "relevance": "ELO-based LLM evaluation platform (Chatbot Arena), directly relevant as both a methodological precedent for ELO-based ranking and a benchmark for LLM evaluation methodology."
    390     },
    391     {
    392       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    393       "authors": ["J. Wei", "X. Wang", "D. Schuurmans", "M. Bosma"],
    394       "year": 2022,
    395       "relevance": "Foundational chain-of-thought prompting paper, relevant to LLM capability enhancement techniques in the survey scope."
    396     },
    397     {
    398       "title": "Encouraging divergent thinking in large language models through multi-agent debate",
    399       "authors": ["T. Liang", "Z. He", "W. Jiao", "X. Wang"],
    400       "year": 2023,
    401       "arxiv_id": "2305.19118",
    402       "relevance": "Multi-agent debate framework for LLMs, directly comparable to ART's multi-agent evaluation approach."
    403     }
    404   ]
    405 }

Impressum · Datenschutz