scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29952B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
      6     "authors": [
      7       "Lianmin Zheng",
      8       "Wei-Lin Chiang",
      9       "Ying Sheng",
     10       "Siyuan Zhuang",
     11       "Zhanghao Wu",
     12       "Yonghao Zhuang",
     13       "Zi Lin",
     14       "Zhuohan Li",
     15       "Dacheng Li",
     16       "Eric P. Xing",
     17       "Hao Zhang",
     18       "Joseph E. Gonzalez",
     19       "Ion Stoica"
     20     ],
     21     "year": 2023,
     22     "venue": "Neural Information Processing Systems",
     23     "arxiv_id": "2306.05685",
     24     "doi": null
     25   },
     26   "checklist": {
     27     "claims_and_evidence": {
     28       "abstract_claims_supported": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "All abstract claims are directly supported: GPT-4 achieves >80% agreement with humans (Table 5: 85% S2), biases are documented empirically with quantitative results, and MT-bench/Chatbot Arena are introduced and validated with data.",
     32         "source": "haiku"
     33       },
     34       "causal_claims_justified": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Causal claims about mitigation strategies are tested with controlled ablations: reference-guided grading reduces failure rate from 70% to 15% (Table 4), and position-swapping is systematically tested across multiple prompt variants.",
     38         "source": "haiku"
     39       },
     40       "generalization_bounded": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper asserts 'LLM-as-a-judge is a scalable and explainable way to approximate human preferences' broadly, but only validates this for GPT-4 on 80 questions with 58 graduate-student labelers—the conclusion extends beyond the tested scope.",
     44         "source": "haiku"
     45       },
     46       "alternative_explanations_discussed": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper does not discuss the alternative explanation that GPT-4's high agreement with graduate-student labelers reflects shared RLHF training distributions rather than genuine general human preference alignment; self-enhancement bias is dismissed without controlled alternative hypotheses.",
     50         "source": "haiku"
     51       },
     52       "proxy_outcome_distinction": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper explicitly frames LLM judgments as a proxy for human preferences and validates this proxy by measuring agreement against actual human votes (Tables 5–6), clearly distinguishing the proxy from the construct.",
     56         "source": "haiku"
     57       }
     58     },
     59     "limitations_and_scope": {
     60       "limitations_section_present": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Section 6 'Discussion' contains a dedicated 'Limitations' paragraph discussing the focus on helpfulness over safety and the single-metric approach to evaluation.",
     64         "source": "haiku"
     65       },
     66       "threats_to_validity_specific": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The limitations paragraph notes safety neglect and single-metric issues but does not address specific threats such as experimenter bias (authors evaluate their own Vicuna model on their own benchmark), or the unrepresentativeness of graduate-student labelers as a gold standard.",
     70         "source": "haiku"
     71       },
     72       "scope_boundaries_stated": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The paper explicitly states it 'emphasizes helpfulness but largely neglects safety' and defines benchmark scope as 8 categories, 80 questions, and a specific set of models.",
     76         "source": "haiku"
     77       }
     78     },
     79     "conflicts_of_interest": {
     80       "funding_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Funding is disclosed in acknowledgments: gifts from Anyscale, Google, IBM, Intel, Microsoft, MBZUAI, Samsung SDS, Uber, VMware, and a Meta PhD Fellowship for one author.",
     84         "source": "haiku"
     85       },
     86       "affiliations_disclosed": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Author affiliations are disclosed (UC Berkeley, UC San Diego, CMU, Stanford, MBZUAI).",
     90         "source": "haiku"
     91       },
     92       "funder_independent_of_outcome": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "Microsoft and Google are funders and both have direct commercial interests in LLM evaluation outcomes; GPT-4 (Microsoft-backed OpenAI) is evaluated as the primary high-performing judge throughout with no discussion of this conflict.",
     96         "source": "haiku"
     97       },
     98       "financial_interests_declared": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "No competing interests statement is present; the acknowledgments list corporate funders but do not declare any author equity, consulting arrangements, or other financial interests.",
    102         "source": "haiku"
    103       }
    104     },
    105     "scope_and_framing": {
    106       "key_terms_defined": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Key terms are defined: 'LLM-as-a-judge' is introduced with three variants (pairwise, single-answer, reference-guided), 'agreement' is formally defined in Appendix D.3, and each bias type (position, verbosity, self-enhancement) is explicitly defined with examples.",
    110         "source": "haiku"
    111       },
    112       "intended_contribution_clear": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Contributions are explicitly stated: '(1) a systematic study of LLM-as-a-judge; and (2) human preference datasets with high-quality questions and diverse user interactions from MT-bench and Chatbot Arena.'",
    116         "source": "haiku"
    117       },
    118       "engagement_with_prior_work": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Section 2.1 situates the work against three categories of existing benchmarks (core-knowledge, instruction-following, conversational), and concurrent work is cited and compared throughout the main results.",
    122         "source": "haiku"
    123       }
    124     }
    125   },
    126   "type_checklist": {
    127     "empirical": {
    128       "artifacts": {
    129         "code_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "Code and data are publicly released at https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge, including 80 MT-bench questions, 3K expert votes, and 30K conversations.",
    133           "source": "haiku"
    134         },
    135         "data_released": {
    136           "applies": true,
    137           "answer": true,
    138           "justification": "MT-bench questions, 3K expert votes, and 30K Chatbot Arena conversations with human preferences are publicly released.",
    139           "source": "haiku"
    140         },
    141         "environment_specified": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "No requirements.txt, Dockerfile, or equivalent environment specification is provided; training mentions '8x A100 GPUs' and libraries (FlashAttention, SkyPilot) but no reproducible environment spec.",
    145           "source": "haiku"
    146         },
    147         "reproduction_instructions": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "The paper provides judge prompts in the appendix and a GitHub URL but does not include step-by-step instructions for reproducing the main agreement evaluation results.",
    151           "source": "haiku"
    152         }
    153       },
    154       "statistical_methodology": {
    155         "confidence_intervals_or_error_bars": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "All agreement rates and win rates are reported as point estimates with no confidence intervals or error bars anywhere in the paper.",
    159           "source": "haiku"
    160         },
    161         "significance_tests": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "No statistical significance tests are applied to any comparisons; differences between judges and human agreement rates are presented without significance testing.",
    165           "source": "haiku"
    166         },
    167         "effect_sizes_reported": {
    168           "applies": true,
    169           "answer": true,
    170           "justification": "Effect sizes are reported as percentage differences with baseline context, e.g., reference-guided grading reduces failure rate from 14/20 to 3/20 cases; win rate differences between model pairs are quantified.",
    171           "source": "haiku"
    172         },
    173         "sample_size_justified": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "Sample sizes (3K expert votes, 3K arena votes, 23 cases for verbosity test, 10 math questions) are stated but never justified; no power analysis is provided.",
    177           "source": "haiku"
    178         },
    179         "variance_reported": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "No variance, standard deviation, or spread is reported for any result; all metrics are single point estimates.",
    183           "source": "haiku"
    184         }
    185       },
    186       "evaluation_design": {
    187         "baselines_included": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Human evaluation serves as the gold standard baseline, and multiple LLM judges (GPT-3.5, Claude-v1, GPT-4) are compared against each other and against human preferences.",
    191           "source": "haiku"
    192         },
    193         "baselines_contemporary": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "All baselines (GPT-3.5, GPT-4, Claude-v1, Vicuna-13B, LLaMA-13B) are contemporary state-of-the-art models at time of writing (2023).",
    197           "source": "haiku"
    198         },
    199         "ablation_study": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Multiple ablations are performed: different prompt variants (default, rename, score, short, CoT, reference-guided), few-shot vs zero-shot, and single vs multi-turn evaluation designs.",
    203           "source": "haiku"
    204         },
    205         "multiple_metrics": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "Multiple metrics are used: agreement rates (S1/S2 setups), position-bias consistency rates, win rates, per-category scores, and failure rates under adversarial attacks.",
    209           "source": "haiku"
    210         },
    211         "human_evaluation": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "58 expert (graduate student) labelers evaluated MT-bench questions at ~$35/hr, and crowdsourced Chatbot Arena users provided 30K votes evaluating actual system outputs.",
    215           "source": "haiku"
    216         },
    217         "held_out_test_set": {
    218           "applies": false,
    219           "answer": false,
    220           "justification": "This is an evaluation methodology study, not a prediction task; there is no held-out test set in the standard ML sense.",
    221           "source": "haiku"
    222         },
    223         "per_category_breakdown": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Table 7 provides per-category win rates for all 8 MT-bench categories; Figure 20 shows category-wise scores for 6 models; Table 10 shows position bias by category.",
    227           "source": "haiku"
    228         },
    229         "failure_cases_discussed": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "Appendix B/C shows concrete failure cases including position bias examples (Figure 11), verbosity attack failures (Figure 12), math grading errors (Figures 13–15), and multi-turn reference errors (Figure 16).",
    233           "source": "haiku"
    234         },
    235         "negative_results_reported": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Significant negative results are reported: Claude-v1 shows only 23.8% consistency; GPT-4 misjudges 14 of 20 math grading cases (10 questions, each judged in both answer orders) with the default prompt; CoT still produces wrong judgments due to context contamination by given answers.",
    239           "source": "haiku"
    240         }
    241       },
    242       "setup_transparency": {
    243         "model_versions_specified": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "Main results tables use generic names ('GPT-4', 'GPT-3.5', 'Claude-v1') without snapshot dates; the specific version 'gpt-4-0314' is mentioned only in Appendix B as a caveat about reproducibility.",
    247           "source": "haiku"
    248         },
    249         "prompts_provided": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "All judge prompt templates are fully provided in Appendix A (Figures 5–10), including pairwise, single-answer, CoT, reference-guided, and multi-turn variants with complete text.",
    253           "source": "haiku"
    254         },
    255         "hyperparameters_reported": {
    256           "applies": true,
    257           "answer": false,
    258           "justification": "Inference hyperparameters for LLM judges (temperature, top-p) are not reported anywhere; training hyperparameters for Vicuna are reported but that is secondary to the main evaluation.",
    259           "source": "haiku"
    260         },
    261         "scaffolding_described": {
    262           "applies": false,
    263           "answer": false,
    264           "justification": "No agentic scaffolding is used; the evaluation involves direct API calls to LLMs as judges.",
    265           "source": "haiku"
    266         },
    267         "data_preprocessing_documented": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "Preprocessing is documented: Vicuna training data was HTML-to-markdown converted with quality filtering (resulting in 125K conversations), and Chatbot Arena data will have PII cleaned and toxic content tagged.",
    271           "source": "haiku"
    272         }
    273       },
    274       "data_integrity": {
    275         "raw_data_available": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "Raw data (80 MT-bench questions, 3K expert votes, 30K conversations) is released publicly at the GitHub repository.",
    279           "source": "haiku"
    280         },
    281         "data_collection_described": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "Appendix C describes the data collection process for both MT-bench (voting interface, expert labeling procedure, screenshots) and Chatbot Arena (anonymous battle platform, consent process) in detail.",
    285           "source": "haiku"
    286         },
    287         "recruitment_methods_described": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "MT-bench labelers were recruited via application form with consent, paid $20 for 20 questions (~$35/hr), and were mostly graduate students from over ten universities.",
    291           "source": "haiku"
    292         },
    293         "data_pipeline_documented": {
    294           "applies": true,
    295           "answer": true,
    296           "justification": "Appendix C documents the full pipeline including voting interface screenshots, consent procedures, how GPT-4 judgments are shown to disagreeing humans, and PII handling plans for data release.",
    297           "source": "haiku"
    298         }
    299       },
    300       "contamination": {
    301         "training_cutoff_stated": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "Training data cutoffs for GPT-4, GPT-3.5, and Claude are not stated anywhere in the paper.",
    305           "source": "haiku"
    306         },
    307         "train_test_overlap_discussed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "The paper does not discuss whether MT-bench question types or Chatbot Arena conversation patterns could overlap with the evaluated models' training data.",
    311           "source": "haiku"
    312         },
    313         "benchmark_contamination_addressed": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "MT-bench is described as newly created but no analysis of whether similar open-ended questions appeared in GPT-4's or other models' training data is provided.",
    317           "source": "haiku"
    318         }
    319       },
    320       "human_studies": {
    321         "pre_registered": {
    322           "applies": true,
    323           "answer": false,
    324           "justification": "No pre-registration is mentioned for the human evaluation study.",
    325           "source": "haiku"
    326         },
    327         "irb_or_ethics_approval": {
    328           "applies": true,
    329           "answer": false,
    330           "justification": "No IRB or ethics approval is mentioned; only that participants signed consent forms before participating.",
    331           "source": "haiku"
    332         },
    333         "demographics_reported": {
    334           "applies": true,
    335           "answer": false,
    336           "justification": "Only minimal information provided: 'mostly graduate students from more than ten universities'; no age, gender, field distribution, or other demographics reported.",
    337           "source": "haiku"
    338         },
    339         "inclusion_exclusion_criteria": {
    340           "applies": true,
    341           "answer": false,
    342           "justification": "No formal inclusion/exclusion criteria are stated; the description 'mostly graduate students considered experts' implies informal expertise filtering but no explicit criteria.",
    343           "source": "haiku"
    344         },
    345         "randomization_described": {
    346           "applies": true,
    347           "answer": true,
    348           "justification": "Randomization of question assignment is described: 'let each human evaluate at least 20 random multi-turn questions'; model identity randomization is also implemented.",
    349           "source": "haiku"
    350         },
    351         "blinding_described": {
    352           "applies": true,
    353           "answer": true,
    354           "justification": "Blinding is implemented in both platforms: MT-bench presents 'answers from two random anonymous assistants' and Chatbot Arena discloses model identities only after voting.",
    355           "source": "haiku"
    356         },
    357         "attrition_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No attrition or dropout statistics are reported for the human evaluation participants.",
    361           "source": "haiku"
    362         }
    363       },
    364       "cost_and_practicality": {
    365         "inference_cost_reported": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "The paper notes only in relative terms that few-shot prompts make API calls '4× more expensive'; absolute inference costs for any evaluation run are not reported.",
    369           "source": "haiku"
    370         },
    371         "compute_budget_stated": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "Vicuna training mentions '8x A100 GPUs' and 'around 2 days' for the longest run, but a total compute budget (GPU-hours, cost) is not stated.",
    375           "source": "haiku"
    376         }
    377       }
    378     }
    379   },
    380   "claims": [
    381     {
    382       "claim": "GPT-4 as a judge achieves over 80% agreement with human expert preferences, matching human-human agreement levels.",
    383       "evidence": "Table 5 shows 85% non-tie agreement between GPT-4 pairwise comparison and human experts on MT-bench; Table 6 shows 87% on Chatbot Arena. Human-human agreement is 81%.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "All LLM judges exhibit significant position bias, with Claude-v1 showing only 23.8% consistency.",
    388       "evidence": "Table 2 shows Claude-v1 at 23.8% consistency with default prompt (75% biased toward first position); GPT-4 is best at 65% consistency—still substantial bias.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "LLM judges are highly susceptible to verbosity attacks, with Claude and GPT-3.5 failing 91.3% of cases while GPT-4 fails only 8.7%.",
    393       "evidence": "Table 3 shows failure rates under 'repetitive list' attack on 23 answers: Claude-v1 91.3%, GPT-3.5 91.3%, GPT-4 8.7%.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Reference-guided grading reduces GPT-4's math grading failure rate from 70% to 15%.",
    398       "evidence": "Table 4 shows failure rates on 10 math questions, each judged in both answer orders (20 judgments per prompt): default 14/20 (70%), CoT 6/20 (30%), reference-guided 3/20 (15%).",
    399       "supported": "strong"
    400     },
    401     {
    402       "claim": "LLM judge agreement with humans increases as the performance gap between compared models grows.",
    403       "evidence": "Figure 2 shows agreement rising from ~70% to ~100% as win rate difference between model pairs increases; near-identical models yield lowest agreement.",
    404       "supported": "strong"
    405     },
    406     {
    407       "claim": "GPT-4 shows self-enhancement bias, favoring itself with ~10% higher win rate compared to human judgments.",
    408       "evidence": "Figure 3(b) shows GPT-4 favors itself by ~10% and Claude-v1 favors itself by ~25%, but the authors acknowledge that 'limited data and small differences' prevent definitive conclusions.",
    409       "supported": "moderate"
    410     },
    411     {
    412       "claim": "A fine-tuned Vicuna-13B can approach GPT-4-level judge agreement (85.5% vs 87% non-tie S2).",
    413       "evidence": "Appendix F Table 15 shows fine-tuned Vicuna-13B achieves 85.5% non-tie agreement vs GPT-4's ~87% on the same 3K test set; these results are preliminary.",
    414       "supported": "moderate"
    415     }
    416   ],
    417   "methodology_tags": [
    418     "benchmark-eval",
    419     "observational"
    420   ],
    421   "key_findings": "GPT-4 as an LLM judge achieves 85%+ agreement with human expert preferences on MT-bench, matching human-human agreement levels and validating LLM-as-a-judge as a scalable evaluation approach. All tested LLMs exhibit significant position bias (as low as 23.8% consistency for Claude-v1), and Claude/GPT-3.5 fail 91.3% of verbosity attacks while GPT-4 is substantially more robust. Reference-guided grading reduces GPT-4's math/reasoning grading failure rate from 70% to 15%. The publicly released MT-bench, 3K expert votes, and 30K Chatbot Arena conversations established an influential benchmark infrastructure for future LLM evaluation research.",
    422   "red_flags": [
    423     {
    424       "flag": "No statistical tests or CIs",
    425       "detail": "All agreement rates (e.g., 85% GPT-4 vs humans) are reported as point estimates without confidence intervals, significance tests, or variance measures, making reliability assessment impossible."
    426     },
    427     {
    428       "flag": "Authors evaluate own model on own benchmark",
    429       "detail": "The paper evaluates Vicuna (created by the same UC Berkeley team) on MT-bench (also created by the same team), with no discussion of how this circular setup might inflate Vicuna's apparent performance."
    430     },
    431     {
    432       "flag": "Unrepresentative human gold standard",
    433       "detail": "The human 'gold standard' consists of 58 graduate students who are 'considered experts'; GPT-4's high agreement with this group may reflect shared RLHF/academic writing preferences rather than alignment with general human preferences."
    434     },
    435     {
    436       "flag": "Funder conflict with primary positive result",
    437       "detail": "Microsoft and Google are listed funders; GPT-4 (from Microsoft-backed OpenAI) is the paper's primary high-performing judge and is evaluated favorably throughout with no conflict-of-interest discussion."
    438     },
    439     {
    440       "flag": "Self-enhancement bias dismissed without power analysis",
    441       "detail": "The observed 10–25% win rate inflation for self-judging is dismissed with 'limited data and small differences,' but no power analysis is provided; the effect size is practically significant for RLHF pipeline integrity."
    442     },
    443     {
    444       "flag": "Inference hyperparameters unreported",
    445       "detail": "Temperature, top-p, and other inference parameters for GPT-4, GPT-3.5, and Claude as judges are not reported, making results difficult to reproduce exactly."
    446     }
    447   ],
    448   "cited_papers": [
    449     {
    450       "title": "Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback",
    451       "relevance": "Foundational RLHF alignment work that motivates the need for scalable human preference evaluation; directly cited as context for why chat assistants need preference-based benchmarks."
    452     },
    453     {
    454       "title": "Measuring Massive Multitask Language Understanding (MMLU)",
    455       "relevance": "Core-knowledge benchmark shown to be inadequate for evaluating chat assistants; central contrast benchmark motivating the need for MT-bench."
    456     },
    457     {
    458       "title": "Holistic Evaluation of Language Models (HELM)",
    459       "relevance": "Comprehensive benchmark framework contrasted with MT-bench's preference-based approach; cited as existing but insufficient for human preference measurement."
    460     },
    461     {
    462       "title": "AlpacaFarm: A Simulation Framework for Methods that Learn from Human Feedback",
    463       "relevance": "Concurrent work on simulating human feedback evaluation; directly related to the LLM-as-judge paradigm studied in this paper."
    464     },
    465     {
    466       "title": "Can Large Language Models Be an Alternative to Human Evaluations?",
    467       "relevance": "Concurrent work studying the same core question of LLM-as-judge viability, cited as parallel investigation."
    468     },
    469     {
    470       "title": "Large Language Models Are Not Fair Evaluators",
    471       "relevance": "Concurrent work identifying LLM judge position biases, directly related to this paper's bias analysis section."
    472     },
    473     {
    474       "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models",
    475       "relevance": "CoT technique applied to the reference-guided judge design to improve math/reasoning evaluation quality."
    476     },
    477     {
    478       "title": "PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization",
    479       "relevance": "Concurrent work on automatic LLM evaluation benchmarking; cited as related to the LLM-as-judge paradigm."
    480     }
    481   ],
    482   "engagement_factors": {
    483     "practical_relevance": {
    484       "score": 3,
    485       "justification": "LLM-as-a-judge is now a standard evaluation approach widely adopted across the industry; Chatbot Arena is a live, publicly accessible platform actively used for model comparison."
    486     },
    487     "surprise_contrarian": {
    488       "score": 2,
    489       "justification": "The finding that GPT-4 matches human evaluators at 80%+ agreement challenged the assumption that human evaluation is irreplaceable for open-ended tasks."
    490     },
    491     "fear_safety": {
    492       "score": 1,
    493       "justification": "Raises modest concerns about evaluation biases (position, verbosity, self-enhancement) that could systematically corrupt RLHF training pipelines if uncorrected."
    494     },
    495     "drama_conflict": {
    496       "score": 1,
    497       "justification": "Position bias findings (Claude only 23.8% consistency) create controversy about reliability of LLM evaluators and implicitly critique widely-used evaluation practices."
    498     },
    499     "demo_ability": {
    500       "score": 3,
    501       "justification": "Chatbot Arena is publicly accessible and widely used; anyone can interact with it immediately and vote on model comparisons."
    502     },
    503     "brand_recognition": {
    504       "score": 3,
    505       "justification": "UC Berkeley LMSYS group, GPT-4, NeurIPS Datasets and Benchmarks track; this became one of the most cited papers establishing LLM evaluation methodology."
    506     }
    507   },
    508   "hn_data": {
    509     "threads": [
    510       {
    511         "hn_id": "35445312",
    512         "title": "Show HN: Want something better than k-means? Try BanditPAM",
    513         "points": 281,
    514         "comments": 41,
    515         "url": "https://news.ycombinator.com/item?id=35445312",
    516         "created_at": "2023-04-04T20:16:33Z"
    517       },
    518       {
    519         "hn_id": "35833868",
    520         "title": "LORA: Low-Rank Adaptation of Large Language Models",
    521         "points": 42,
    522         "comments": 3,
    523         "url": "https://news.ycombinator.com/item?id=35833868",
    524         "created_at": "2023-05-05T19:10:58Z"
    525       },
    526       {
    527         "hn_id": "37531815",
    528         "title": "Mesa-optimization algorithms in Transformers[pdf]",
    529         "points": 23,
    530         "comments": 5,
    531         "url": "https://news.ycombinator.com/item?id=37531815",
    532         "created_at": "2023-09-16T03:35:25Z"
    533       },
    534       {
    535         "hn_id": "37555617",
    536         "title": "Efficiently Correcting Reasoning Failures in Large Language Models",
    537         "points": 5,
    538         "comments": 0,
    539         "url": "https://news.ycombinator.com/item?id=37555617",
    540         "created_at": "2023-09-18T13:10:22Z"
    541       },
    542       {
    543         "hn_id": "41136426",
    544         "title": "Show HN: I built Choosy Chat to get the best answer between GPT, Claude, Gemini",
    545         "points": 4,
    546         "comments": 0,
    547         "url": "https://news.ycombinator.com/item?id=41136426",
    548         "created_at": "2024-08-02T05:52:12Z"
    549       },
    550       {
    551         "hn_id": "38720557",
    552         "title": "ReLoRA: High-Rank Training Through Low-Rank Updates",
    553         "points": 3,
    554         "comments": 0,
    555         "url": "https://news.ycombinator.com/item?id=38720557",
    556         "created_at": "2023-12-21T14:09:38Z"
    557       },
    558       {
    559         "hn_id": "29337457",
    560         "title": "BanditPAM, an improved alternative to k-means",
    561         "points": 3,
    562         "comments": 0,
    563         "url": "https://news.ycombinator.com/item?id=29337457",
    564         "created_at": "2021-11-25T01:55:55Z"
    565       },
    566       {
    567         "hn_id": "38561676",
    568         "title": "AQuaSurF: Discover better activation functions for your ML task",
    569         "points": 2,
    570         "comments": 1,
    571         "url": "https://news.ycombinator.com/item?id=38561676",
    572         "created_at": "2023-12-07T20:53:04Z"
    573       },
    574       {
    575         "hn_id": "40207489",
    576         "title": "Large Language Model for Science: A Study on P vs. NP",
    577         "points": 2,
    578         "comments": 0,
    579         "url": "https://news.ycombinator.com/item?id=40207489",
    580         "created_at": "2024-04-30T05:20:16Z"
    581       },
    582       {
    583         "hn_id": "39934427",
    584         "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    585         "points": 1,
    586         "comments": 0,
    587         "url": "https://news.ycombinator.com/item?id=39934427",
    588         "created_at": "2024-04-04T18:56:42Z"
    589       }
    590     ],
    591     "top_points": 281,
    592     "total_points": 366,
    593     "total_comments": 50
    594   }
    595 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs