scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26870B)
      1 {
      2   "paper": {
      3     "title": "Improving Factuality and Reasoning in Language Models through Multiagent Debate",
      4     "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba", "Joshua B. Tenenbaum", "Igor Mordatch"],
      5     "year": 2023,
      6     "venue": "International Conference on Machine Learning",
      7     "arxiv_id": "2305.14325",
      8     "doi": "10.48550/arXiv.2305.14325"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Multiagent debate between multiple LLM instances significantly improves reasoning (arithmetic 67→82%, GSM8K 77→85%) and factual accuracy (biographies 66→74%, MMLU 64→71%) over single-agent baselines. Performance scales with both number of agents and debate rounds. Debate can correct errors even when all agents initially give incorrect answers, and agents converge to consensus through iterative critique.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The abstract references a project website at https://composable-models.github.io/llm_debate/. This serves as the project page, though the paper itself does not explicitly link a source-code repository."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available datasets (GSM8K, MMLU, BIG-Bench chess validity) and describes how custom datasets (arithmetic, biographies) are constructed. The biographies dataset of 524 computer scientists with ground truth from Wikipedia is introduced but it's unclear if it's released at the project website."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, requirements.txt, or dependency details are provided. The paper only mentions using the gpt-3.5-turbo-0301 API."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The appendix gives evaluation details and prompts but no runnable scripts or systematic reproduction guide."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Tables 1 and 2 report ± values for all results (e.g., '81.8 ± 2.3' for arithmetic debate, '73.8 ± 2.3' for biographies debate)."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are reported. Claims of improvement (e.g., 'substantially outperforms') are based solely on comparing point estimates with error bars, with no p-values or formal tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Absolute accuracy improvements are reported with baselines, e.g., arithmetic goes from 67.0% to 81.8%, GSM8K from 77.0% to 85.0% (Tables 1-2), providing context for the magnitude of improvement."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Sample sizes are stated (100 examples for most tasks, 300 for chess) but no justification is given for why these sizes are sufficient. The choice of 100 appears arbitrary."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Tables 1 and 2 report ± values (likely standard error or standard deviation) across results, indicating variance is captured."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Three baselines are compared: single agent, single agent with reflection, and multi-agent majority voting (Section 3.1)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include contemporary approaches: zero-shot chain of thought (Kojima et al. 2022), self-reflection (Shinn et al. 2023, Madaan et al. 2023), and self-consistency/majority voting (Wang et al. 2022). These were recent at time of submission."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 3.3 provides multiple ablations: number of agents (Figure 10a), rounds of debate (Figure 10b), debate prompt length (Figure 12), summarization vs concatenation (Figure 13), different initialization prompts, and different language models."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics are used across tasks: accuracy for arithmetic/GSM8K/MMLU/biographies, pawn score for chess move quality, and move validity for chess validity."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation of model outputs is performed. Biography accuracy is evaluated using chatGPT as a judge (Appendix A.2), not human raters. The factuality evaluation is entirely automated."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The approach uses no training, only inference-time debate. Test sets are drawn from established benchmarks (GSM8K, MMLU, BIG-Bench) or freshly generated (arithmetic). No tuning is done on test data."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down across six different tasks (arithmetic, GSM8K, chess move optimality, biographies, MMLU, chess move validity) in Tables 1 and 2 and Figure 1."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Figures 21-23 in the appendix show qualitative examples where debate leads to incorrect answers. The Limitations section (Section 5) discusses convergence to incorrect answers and context processing issues."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that reflection hurts performance on MMLU (57.7% vs 63.9% single agent, Table 2), that debates sometimes converge to incorrect answers (Section 5, Figures 21-23), and that longer debates plateau after 4 rounds."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims of improved mathematical/strategic reasoning and factual validity are supported by Tables 1-2 showing improvements across all six tasks. The claim that debate reduces hallucinations is supported by the biography experiments."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims like 'multiagent debate improves reasoning' are justified through controlled ablation comparisons — same model, same prompts, varying only the debate mechanism. The ablation studies (Section 3.3) isolate individual factors (number of agents, rounds, prompt type)."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The abstract claims the approach has 'potential to significantly advance the capabilities of LLMs' and 'pave the way for further breakthroughs in language generation and understanding,' but experiments are limited to chatGPT (gpt-3.5-turbo-0301) on 6 specific tasks. The cross-model experiment (chatGPT+Bard) uses only 20 problems. The title makes a broad claim about 'Language Models' based on one model."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper offers no substantive discussion of alternative explanations. For instance, it does not examine whether debate improvements are simply due to more compute/tokens generated (vs. a single agent), or whether majority voting with more samples would close the gap. The paper does not control for total token budget."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper's claims are well-matched to its measurements: it claims improved accuracy on specific benchmarks and reports accuracy on those benchmarks. It does not overclaim that benchmark accuracy equals general 'reasoning ability' — it consistently uses task-specific metrics."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Appendix A.2 states: 'We run all experiments using the gpt-3.5-turbo-0301 model.' This is a specific API snapshot version."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full prompt text is provided in Figure 3 (debate prompts) and Table 15 in the Appendix (starting and debate prompts for all six tasks)."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No temperature, top-p, or other sampling hyperparameters are reported. The paper uses the chatGPT API without stating these settings, which significantly affect output quality and diversity."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The approach is simple prompt-based: generate answers, concatenate responses, prompt for updates. No tools, memory, or retry logic."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Appendix A.2 describes data generation for arithmetic (random integers 0-30), data sources for chess (pgnmentor.com), evaluation procedure for biographies (chatGPT-based validation), and sample sizes for each task."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5 'Limitations and Discussion' provides a dedicated limitations discussion covering computational cost, context length issues, and convergence to incorrect answers."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 5 discusses specific threats: (1) longer debates cause models to focus only on recent generations due to context limits, (2) debates can converge to incorrect answers because LMs don't correctly express uncertainty, (3) computational expense as a practical limitation."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly bound what the results do NOT show. It does not state limitations regarding model generalization (only tested gpt-3.5-turbo), task scope, or what claims are NOT being made. The limitations focus on known failure modes rather than boundary conditions."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Raw model outputs, debate transcripts, and per-example predictions are not made available. Only aggregate results are reported. The biographies ground truth dataset (524 CS scientists) availability is unclear."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Appendix A.2 describes data collection: arithmetic tasks are randomly generated, GSM8K and MMLU problems are selected from existing datasets, chess games from pgnmentor.com, and biographies ground truth from Wikipedia."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data comes from standard benchmarks and synthetic generation."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The evaluation pipeline is documented for each task in Appendix A.2, including how answers are extracted, how biography accuracy is evaluated (chatGPT comparison with ground truth), and how chess moves are scored (Stockfish with depth 20)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding disclosure or acknowledgments section is present in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are listed: MIT CSAIL and Google Brain. The paper evaluates OpenAI's chatGPT and Google's Bard, and one author (Mordatch) is at Google Brain, which is disclosed."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence cannot be assessed. One author is at Google Brain and the paper evaluates Google's Bard favorably (Bard+chatGPT debate outperforms either alone), creating a potential conflict."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The training data cutoff for gpt-3.5-turbo-0301 is not stated. The model could have been trained on GSM8K, MMLU, and BIG-Bench data."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether GSM8K, MMLU, or BIG-Bench examples appeared in the training data of gpt-3.5-turbo. These are all public benchmarks that predate the model."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "GSM8K (2021), MMLU (2020), and BIG-Bench (2022) all predate the gpt-3.5-turbo-0301 model snapshot and may appear in its training data. No contamination analysis is provided despite this high contamination risk."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "The paper acknowledges debate is 'more computationally expensive' (Section 1, Section 5) but provides no quantification of API costs, tokens consumed, or latency. The method requires 3 agents × 2+ rounds of debate per query."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total compute budget, API spend, or token counts are reported despite running thousands of API calls across six benchmarks."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No seed sensitivity analysis. The ± values in tables may reflect variance across examples rather than across random seeds. The paper does not discuss stochasticity from API sampling."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of runs per experiment is not stated. It is unclear whether results are from single runs or averaged over multiple runs."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. The choice of 3 agents and 2 rounds appears to be the default, with ablations shown, but no search budget for these or other hyperparameters is discussed."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The default configuration of 3 agents and 2 debate rounds is used consistently, with ablation studies (Figures 10, 12, 13) showing performance across different configurations rather than cherry-picking the best."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No statistical tests are performed at all, let alone corrections for multiple comparisons across six tasks and multiple baselines."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement their own baselines (single agent, reflection, majority voting) without acknowledging potential bias from implementing competitors' methods. No independent evaluation is conducted."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Debate uses at least 3× as many model calls as a single agent, and many more with multiple rounds, but performance is never compared at matched compute budgets. A fairer comparison would be a single agent with 6+ samples vs. a 3-agent, 2-round debate."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper does not discuss whether the benchmarks actually measure the claimed capabilities — for example, whether MMLU accuracy reflects 'factuality' or memorization, or whether GSM8K captures 'reasoning' vs. pattern matching."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding confound — the debate mechanism IS the thing being tested, and the same model is used across all conditions."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "Not discussed. GSM8K (2021), MMLU (2020), and BIG-Bench (2022) all predate gpt-3.5-turbo's training. The model may have memorized solutions."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "Not discussed. No analysis of whether the evaluation setup leaks information."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Not discussed. The 100-example subsets selected from GSM8K and MMLU may not be representative, and selection criteria are not given."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods are applied despite using public benchmarks with a model that likely trained on them."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Multiagent debate significantly improves mathematical reasoning: arithmetic accuracy from 67.0% to 81.8%, GSM8K from 77.0% to 85.0%",
    365       "evidence": "Table 1, Section 3.1. Comparison against single agent, reflection, and majority voting baselines with error bars.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "Multiagent debate improves factual accuracy: biographies from 66.0% to 73.8%, MMLU from 63.9% to 71.1%",
    370       "evidence": "Table 2, Section 3.2. Comparison against single agent and reflection baselines.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Debate can correct errors even when all agents initially give incorrect answers",
    375       "evidence": "Figures 4, 5, 11, and qualitative examples in appendix showing all-wrong → correct convergence.",
    376       "supported": "weak"
    377     },
    378     {
    379       "claim": "Performance improves with more agents and more rounds of debate",
    380       "evidence": "Figure 10a (agents) and 10b (rounds) on arithmetic task showing monotonic improvement.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Debate between different models (chatGPT + Bard) improves performance beyond either model alone (Bard: 11/20, chatGPT: 14/20, joint: 17/20 on GSM8K subset)",
    385       "evidence": "Section 3.3, tested on 20 GSM8K problems only.",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "Prompts that encourage longer debate lead to better final answers",
    390       "evidence": "Figure 12, comparing short and long debate prompts on arithmetic.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No compute budget comparison",
    397       "detail": "Debate uses 3+ model calls per round for multiple rounds, but is never compared against single-agent with equivalent token budget. Self-consistency with 6 samples might match debate with 3 agents × 2 rounds at lower complexity."
    398     },
    399     {
    400       "flag": "Tiny cross-model experiment",
    401       "detail": "The chatGPT+Bard experiment uses only 20 GSM8K problems — far too small to draw conclusions about cross-model debate, yet is highlighted as a contribution."
    402     },
    403     {
    404       "flag": "No contamination analysis",
    405       "detail": "All benchmarks (GSM8K, MMLU, BIG-Bench) predate gpt-3.5-turbo training. If the model memorized solutions, debate may just help it retrieve them rather than reason. This alternative explanation is never considered."
    406     },
    407     {
    408       "flag": "ChatGPT-as-judge evaluation",
    409       "detail": "Biography factuality is evaluated by prompting chatGPT to compare generated bullets with ground truth. This creates a circularity: the same model family generates and evaluates content. The paper acknowledges this metric may miss incorrect information not in ground truth."
    410     },
    411     {
    412       "flag": "Subsample selection not justified",
    413       "detail": "100 examples are 'selected' from GSM8K, MMLU, and chess validity, but selection criteria are not described. Results could differ on the full benchmarks."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Self-consistency improves chain of thought reasoning in language models",
    419       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans", "Quoc Le", "Ed Chi", "Denny Zhou"],
    420       "year": 2022,
    421       "arxiv_id": "2203.11171",
    422       "relevance": "Key baseline approach using majority voting across multiple chain-of-thought samples, directly compared against debate."
    423     },
    424     {
    425       "title": "Chain of thought prompting elicits reasoning in large language models",
    426       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Ed Chi", "Quoc Le", "Denny Zhou"],
    427       "year": 2022,
    428       "arxiv_id": "2201.11903",
    429       "relevance": "Foundational prompting technique for LLM reasoning, combined with debate in this paper."
    430     },
    431     {
    432       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    433       "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"],
    434       "year": 2023,
    435       "arxiv_id": "2303.17651",
    436       "relevance": "Single-agent self-reflection baseline directly compared against multiagent debate."
    437     },
    438     {
    439       "title": "Reflexion: an autonomous agent with dynamic memory and self-reflection",
    440       "authors": ["Noah Shinn", "Beck Labash", "Ashwin Gopinath"],
    441       "year": 2023,
    442       "arxiv_id": "2303.11366",
    443       "relevance": "Agentic reflection approach used as a baseline for debate comparison."
    444     },
    445     {
    446       "title": "Large language models are zero-shot reasoners",
    447       "authors": ["Takeshi Kojima", "Shixiang Shane Gu", "Machel Reid", "Yutaka Matsuo", "Yusuke Iwasawa"],
    448       "year": 2022,
    449       "arxiv_id": "2205.11916",
    450       "relevance": "Zero-shot chain-of-thought prompting technique integrated into the debate framework."
    451     },
    452     {
    453       "title": "Training verifiers to solve math word problems",
    454       "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian"],
    455       "year": 2021,
    456       "arxiv_id": "2110.14168",
    457       "relevance": "Introduces GSM8K benchmark used in evaluation and verification-based approach to math reasoning."
    458     },
    459     {
    460       "title": "AI safety via debate",
    461       "authors": ["Geoffrey Irving", "Paul Christiano", "Dario Amodei"],
    462       "year": 2018,
    463       "arxiv_id": "1805.00899",
    464       "relevance": "Foundational work on debate as an AI safety mechanism; directly related to using debate for improving LLM outputs."
    465     },
    466     {
    467       "title": "Language models (mostly) know what they know",
    468       "authors": ["Saurav Kadavath", "Tom Conerly", "Amanda Askell"],
    469       "year": 2022,
    470       "arxiv_id": "2207.05221",
    471       "relevance": "Studies LLM self-knowledge and calibration, relevant to debate's mechanism of surfacing uncertainty."
    472     },
    473     {
    474       "title": "Beyond the imitation game: Quantifying and extrapolating the capabilities of language models",
    475       "authors": ["Aarohi Srivastava", "Abhinav Rastogi"],
    476       "year": 2022,
    477       "arxiv_id": "2206.04615",
    478       "relevance": "BIG-Bench benchmark used for chess validity evaluation in this paper."
    479     },
    480     {
    481       "title": "Measuring massive multitask language understanding",
    482       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart", "Andy Zou", "Mantas Mazeika", "Dawn Song", "Jacob Steinhardt"],
    483       "year": 2020,
    484       "arxiv_id": "2009.03300",
    485       "relevance": "MMLU benchmark used for factuality evaluation in this paper."
    486     },
    487     {
    488       "title": "Training language models to follow instructions with human feedback",
    489       "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"],
    490       "year": 2022,
    491       "arxiv_id": "2203.02155",
    492       "relevance": "RLHF training that may explain LLM agreeableness in debate, relevant to understanding debate dynamics."
    493     }
    494   ]
    495 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs