scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (20153B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "Lessons from the Trenches on Reproducible Evaluation of Language Models",
      6     "authors": [
      7       "Stella Biderman",
      8       "Hailey Schoelkopf",
      9       "Lintang Sutawika",
     10       "Leo Gao",
     11       "Jonathan Tow",
     12       "Baber Abbasi",
     13       "Alham Fikri Aji",
     14       "Pawan Sasanka Ammanamanchi",
     15       "Sidney Black",
     16       "Jordan Clive",
     17       "Anthony DiPofi",
     18       "Julen Etxaniz",
     19       "Benjamin Fattori",
     20       "Jessica Zosa Forde",
     21       "Charles Foster",
     22       "Jeffrey Hsu",
     23       "Mimansa Jaiswal",
     24       "Wilson Y. Lee",
     25       "Haonan Li",
     26       "Charles Lovering",
     27       "Niklas Muennighoff",
     28       "Ellie Pavlick",
     29       "Jason Phang",
     30       "Aviya Skowron",
     31       "Samson Tan",
     32       "Xiangru Tang",
     33       "Kevin A. Wang",
     34       "Genta Indra Winata",
     35       "François Yvon",
     36       "Andy Zou"
     37     ],
     38     "year": 2024,
     39     "venue": "arXiv.org",
     40     "arxiv_id": "2405.14782",
     41     "doi": "10.48550/arXiv.2405.14782"
     42   },
     43   "checklist": {
     44     "claims_and_evidence": {
     45       "abstract_claims_supported": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "All four abstract claims — challenges in LM evaluation, three years of experience informing guidance, best practices, and the lm-eval library — are substantiated by Sections 2–4 and the case studies.",
     49         "source": "haiku"
     50       },
     51       "causal_claims_justified": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "The paper's main causal claim — that prompt format causes score differences — is demonstrated empirically in Table 1, where switching ARC from cloze to MMLU-style yields a 22pp swing for Mistral-7B (50.1% vs. 72.4%).",
     55         "source": "haiku"
     56       },
     57       "generalization_bounded": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Broad assertions such as 'most works on language modeling do not perform statistical significance testing' are stated without citation to systematic survey evidence, exceeding the demonstrated scope.",
     61         "source": "haiku"
     62       },
     63       "alternative_explanations_discussed": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper explicitly discusses tradeoffs between loglikelihood vs. generative evaluation, and between multiple normalization schemes, presenting pros and cons of each approach rather than a single interpretation.",
     67         "source": "haiku"
     68       },
     69       "proxy_outcome_distinction": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Section 2.2 explicitly states 'we do not care about the actual numeric score of a model on a benchmark' and discusses validity as the correlation between benchmark scores and real-world phenomena.",
     73         "source": "haiku"
     74       }
     75     },
     76     "limitations_and_scope": {
     77       "limitations_section_present": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "There is no dedicated limitations or threats-to-validity section; the paper ends with a brief conclusion with no formal acknowledgment of what lm-eval does not address.",
     81         "source": "haiku"
     82       },
     83       "threats_to_validity_specific": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "Construct validity is acknowledged as 'an ongoing problem' but explicitly deferred; no specific threats to the paper's own prescriptive claims are enumerated.",
     87         "source": "haiku"
     88       },
     89       "scope_boundaries_stated": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Section 2.2 explicitly states: 'While validity is an ongoing problem in language model evaluation, we focus on mitigating other concerns first' — a clear and specific scope boundary.",
     93         "source": "haiku"
     94       }
     95     },
     96     "conflicts_of_interest": {
     97       "funding_disclosed": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No funding sources are disclosed anywhere in the provided paper text.",
    101         "source": "haiku"
    102       },
    103       "affiliations_disclosed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Author affiliations are listed on the title page (EleutherAI, Stability AI, MBZUAI, Brown University, NYU, CMU, etc.).",
    107         "source": "haiku"
    108       },
    109       "funder_independent_of_outcome": {
    110         "applies": false,
    111         "answer": false,
    112         "justification": "No funding is disclosed, so this criterion is not applicable; however, the primary authors are from EleutherAI and are presenting their own lm-eval tool without disclosing this institutional self-interest.",
    113         "source": "haiku"
    114       },
    115       "financial_interests_declared": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "No competing interests or financial interests statement appears in the paper.",
    119         "source": "haiku"
    120       }
    121     },
    122     "scope_and_framing": {
    123       "key_terms_defined": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "'The Key Problem' is formally defined (semantic equivalence challenge), 'validity' is cited to Messick (1994), and loglikelihood, perplexity, and normalization approaches are formally defined in Appendix A.",
    127         "source": "haiku"
    128       },
    129       "intended_contribution_clear": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Section 1 explicitly lists three contributions: (1) overview of common evaluation challenges, (2) best practices, (3) the lm-eval library for reproducible benchmarking.",
    133         "source": "haiku"
    134       },
    135       "engagement_with_prior_work": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper engages substantively with HELM, BIG-Bench, PromptSource, and individual benchmarks (MMLU, ARC, HellaSwag), situating lm-eval's 'orchestration-first' philosophy against prescriptive alternatives.",
    139         "source": "haiku"
    140       }
    141     }
    142   },
    143   "type_checklist": {
    144     "benchmark-creation": {
    145       "construct_design": {
    146         "construct_validity_argued": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "The paper explicitly defers construct validity: 'While validity is an ongoing problem, we focus on mitigating other concerns first.' lm-eval is a framework wrapping existing benchmarks, not a new benchmark with a validity argument.",
    150           "source": "haiku"
    151         },
    152         "difficulty_distribution_characterized": {
    153           "applies": false,
    154           "answer": false,
    155           "justification": "lm-eval is an evaluation framework, not a new benchmark; no new benchmark items are created, making difficulty distribution characterization not applicable.",
    156           "source": "haiku"
    157         },
    158         "ceiling_floor_effects_checked": {
    159           "applies": false,
    160           "answer": false,
    161           "justification": "No new benchmark items are introduced; the paper runs existing benchmarks through a standardized framework, so ceiling/floor checks on newly created items are not applicable.",
    162           "source": "haiku"
    163         },
    164         "human_baseline_included": {
    165           "applies": false,
    166           "answer": false,
    167           "justification": "The paper does not create a new benchmark requiring human performance comparison; all reported results are model-vs-model comparisons on existing benchmarks.",
    168           "source": "haiku"
    169         },
    170         "scoring_rubric_justified": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Appendix A provides extensive formal justification for all scoring approaches (token-length normalization, byte-length normalization, mutual information), explaining when each is appropriate and their tradeoffs.",
    174           "source": "haiku"
    175         }
    176       },
    177       "robustness": {
    178         "contamination_resistance_designed": {
    179           "applies": true,
    180           "answer": false,
    181           "justification": "Data contamination is not discussed as a design consideration for lm-eval; no temporal splits, canary strings, or anti-gaming measures are described in the framework's design.",
    182           "source": "haiku"
    183         },
    184         "temporal_robustness_discussed": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Section 2.4 and Figure 1 explicitly address how benchmarks become outdated as paradigms shift (fine-tuning → in-context learning → chat), noting that most benchmarks predate current evaluation paradigms.",
    188           "source": "haiku"
    189         },
    190         "failure_modes_discussed": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Section 2 comprehensively covers failure modes of LM evaluation: the Key Problem, prompt sensitivity, implementation divergence, API deprecation rendering work irreproducible, and benchmark-paradigm mismatch.",
    194           "source": "haiku"
    195         },
    196         "baseline_implementations_provided": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "lm-eval is open source with YAML configurations and Python task implementations; Appendix B shows the exact configs used for ARC cloze, ARC MMLU-style, MMLU original, and MMLU hybrid variants.",
    200           "source": "haiku"
    201         }
    202       },
    203       "documentation": {
    204         "dataset_documentation_complete": {
    205           "applies": true,
    206           "answer": false,
    207           "justification": "While YAML configs are shown, there is no data card, systematic collection methodology, or preprocessing documentation for the benchmarks the framework wraps.",
    208           "source": "haiku"
    209         },
    210         "licensing_and_access_clear": {
    211           "applies": true,
    212           "answer": false,
    213           "justification": "The paper states lm-eval is open source but does not specify its license; no explicit licensing terms appear in the paper text.",
    214           "source": "haiku"
    215         },
    216         "intended_use_specified": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Section 4 explicitly states lm-eval's purpose is orchestration and reproducibility, and Section 2.2 specifies what it does NOT do (prescribe which benchmarks or address construct validity).",
    220           "source": "haiku"
    221         }
    222       }
    223     }
    224   },
    225   "claims": [
    226     {
    227       "claim": "LLM benchmark scores are highly sensitive to prompt format, with the same model on the same benchmark varying by 22+ percentage points depending on prompting style.",
    228       "evidence": "Table 1 shows Mistral-7B on ARC scores 50.1% (cloze) vs. 72.4% (MMLU-style); for MMLU, MMLU-style vs. Hybrid yields 58.6% vs. 48.3%.",
    229       "supported": "strong"
    230     },
    231     {
    232       "claim": "Copying evaluation numbers across papers is misleading because differing implementation details make comparisons invalid.",
    233       "evidence": "Table 1 demonstrates this empirically; Marie et al. (2021) meta-evaluation of 769 MT papers is cited as systematic evidence for the field-wide problem.",
    234       "supported": "strong"
    235     },
    236     {
    237       "claim": "Most LM evaluation papers do not perform statistical significance testing, undermining confidence in reported results.",
    238       "evidence": "Asserted as motivation for a best practice, but no systematic survey data or citation supports the 'most works' quantification.",
    239       "supported": "weak"
    240     },
    241     {
    242       "claim": "lm-eval's standardized implementations give researchers confidence that results from prior work using the library are reproducible.",
    243       "evidence": "Table 2 shows lm-eval adoption across 12 novel architecture papers; the Open LLM Leaderboard uses lm-eval as its backend.",
    244       "supported": "moderate"
    245     },
    246     {
    247       "claim": "Most widely-used LM benchmarks were designed before current evaluation paradigms (in-context learning, chat) and are being used for purposes they were not designed for.",
    248       "evidence": "Figure 1 shows that the benchmarks' release dates all predate the BERT/GPT-2/GPT-3/ChatGPT shifts; the paper states 'common practice diverges from the method described in the paper for all listed tasks except MMLU and MATH.'",
    249       "supported": "strong"
    250     }
    251   ],
    252   "methodology_tags": [
    253     "benchmark-eval",
    254     "case-study",
    255     "theoretical"
    256   ],
    257   "key_findings": "The paper's central empirical finding is that prompt format alone causes 20+ percentage point swings in benchmark scores (e.g., Mistral-7B on ARC: 50.1% cloze vs. 72.4% MMLU-style), rendering cross-paper comparisons unreliable when implementation details differ. The root cause is identified as the 'Key Problem': no reliable automatic method exists to judge semantic equivalence of LM outputs, forcing brittle workarounds (multiple choice, regex matching) that are hypersensitive to formatting. The lm-eval library addresses this through standardized open-source task implementations with version tracking, supporting three core request types (loglikelihood, perplexity, generation) and having been adopted by 12+ novel architecture papers and the HuggingFace Open LLM Leaderboard. Construct validity — whether benchmarks actually measure real-world capabilities — is explicitly out of scope.",
    258   "red_flags": [
    259     {
    260       "flag": "Self-evaluation bias undisclosed",
    261       "detail": "The primary authors are from EleutherAI and are presenting their own lm-eval tool as the solution to evaluation problems; this institutional self-interest is not disclosed as a potential conflict."
    262     },
    263     {
    264       "flag": "No limitations section",
    265       "detail": "The paper makes broad prescriptive claims about evaluation best practices with no formal limitations section acknowledging where those practices may not apply."
    266     },
    267     {
    268       "flag": "Unsupported generalization",
    269       "detail": "'Most works on language modeling do not perform statistical significance testing' is asserted without systematic survey evidence."
    270     },
    271     {
    272       "flag": "Construct validity explicitly deferred",
    273       "detail": "The fundamental question of whether benchmarks measure meaningful capabilities is acknowledged as 'an ongoing problem' then set aside, leaving the paper's prescriptions incomplete."
    274     },
    275     {
    276       "flag": "Benchmark-creation misclassification",
    277       "detail": "This is primarily a position/tool paper about an evaluation framework; lm-eval wraps existing benchmarks rather than creating new ones, making several benchmark-creation criteria structurally inapplicable."
    278     }
    279   ],
    280   "cited_papers": [
    281     {
    282       "title": "Measuring Massive Multitask Language Understanding (MMLU)",
    283       "relevance": "Central case study demonstrating how scoring style (MMLU-style vs. hybrid) causes large score differences on the same benchmark."
    284     },
    285     {
    286       "title": "Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge",
    287       "relevance": "Primary case study showing cloze vs. MMLU-style prompting yields 22pp swings, demonstrating the reproducibility problem."
    288     },
    289     {
    290       "title": "Scientific credibility of machine translation research: A meta-evaluation of 769 papers (Marie et al., 2021)",
    291       "relevance": "Provides systematic field-wide evidence for harm from copying results and lack of significance testing, supporting the paper's prescriptions."
    292     },
    293     {
    294       "title": "Holistic Evaluation of Language Models (HELM)",
    295       "relevance": "Related unified evaluation library; lm-eval's orchestration-first, non-prescriptive approach is explicitly contrasted with HELM."
    296     },
    297     {
    298       "title": "Beyond the Imitation Game (BIG-Bench)",
    299       "relevance": "Related prescriptive benchmark suite; lm-eval is positioned as complementary by focusing on infrastructure rather than mandating specific benchmarks."
    300     },
    301     {
    302       "title": "Quantifying language models' sensitivity to spurious features in prompt design (Sclar et al., 2023)",
    303       "relevance": "Supporting evidence that prompt formatting changes significantly affect LM performance, corroborating the paper's central claim."
    304     },
    305     {
    306       "title": "The benchmark lottery (Dehghani et al., 2021)",
    307       "relevance": "Provides evidence that benchmark choice skews comparisons and influences research directions, motivating the need for lm-eval's standardized infrastructure."
    308     },
    309     {
    310       "title": "A framework for few-shot language model evaluation (Gao et al., 2021)",
    311       "relevance": "The original lm-eval v1 release, which this paper formally introduces and whose evolution it describes."
    312     }
    313   ],
    314   "engagement_factors": {
    315     "practical_relevance": {
    316       "score": 3,
    317       "justification": "lm-eval is a widely-used open-source tool directly applicable to anyone evaluating LLMs; best practices are immediately actionable for any evaluation project."
    318     },
    319     "surprise_contrarian": {
    320       "score": 2,
    321       "justification": "Demonstrating that identical models on identical benchmarks can show 22pp swings from prompt format alone challenges confidence in all published leaderboard numbers."
    322     },
    323     "fear_safety": {
    324       "score": 1,
    325       "justification": "The paper briefly notes that poor evaluation can lead to 'adverse effects from deploying suboptimal or harmful models' but safety implications are not the primary focus."
    326     },
    327     "drama_conflict": {
    328       "score": 1,
    329       "justification": "Implicitly criticizes industrial labs for not sharing evaluation code and for deprecated APIs making hundreds of studies irreproducible, but the tone is constructive rather than confrontational."
    330     },
    331     "demo_ability": {
    332       "score": 3,
    333       "justification": "lm-eval is installable via pip and runnable immediately; Appendix B provides YAML configs that anyone can copy to reproduce the paper's ARC/MMLU comparisons."
    334     },
    335     "brand_recognition": {
    336       "score": 2,
    337       "justification": "EleutherAI is well-known in the open LLM community; lm-eval backs the HuggingFace Open LLM Leaderboard, giving it high visibility."
    338     }
    339   },
    340   "hn_data": {
    341     "threads": [
    342       {
    343         "hn_id": "41536003",
    344         "title": "The Legend of Holy Sword: An Immersive Experience for Concentration Enhancement",
    345         "points": 139,
    346         "comments": 66,
    347         "url": "https://news.ycombinator.com/item?id=41536003",
    348         "created_at": "2024-09-13T23:07:24Z"
    349       },
    350       {
    351         "hn_id": "39144845",
    352         "title": "Tweets to Citations: The Impact of Social Media Influencers on AI Research",
    353         "points": 67,
    354         "comments": 47,
    355         "url": "https://news.ycombinator.com/item?id=39144845",
    356         "created_at": "2024-01-26T16:57:45Z"
    357       },
    358       {
    359         "hn_id": "40474294",
    360         "title": "Lessons from the trenches on reproducible evaluation of language models",
    361         "points": 42,
    362         "comments": 3,
    363         "url": "https://news.ycombinator.com/item?id=40474294",
    364         "created_at": "2024-05-25T11:42:23Z"
    365       },
    366       {
    367         "hn_id": "42590687",
    368         "title": "European Space Agency Benchmark for Anomaly Detection in Satellite Telemetry",
    369         "points": 3,
    370         "comments": 0,
    371         "url": "https://news.ycombinator.com/item?id=42590687",
    372         "created_at": "2025-01-03T23:37:19Z"
    373       },
    374       {
    375         "hn_id": "41049219",
    376         "title": "On the Design and Analysis of LLM-Based Algorithms",
    377         "points": 2,
    378         "comments": 0,
    379         "url": "https://news.ycombinator.com/item?id=41049219",
    380         "created_at": "2024-07-23T18:26:56Z"
    381       },
    382       {
    383         "hn_id": "40467826",
    384         "title": "Lessons from the Trenches on Reproducible Evaluation of Language Models",
    385         "points": 1,
    386         "comments": 0,
    387         "url": "https://news.ycombinator.com/item?id=40467826",
    388         "created_at": "2024-05-24T16:35:27Z"
    389       },
    390       {
    391         "hn_id": "41230495",
    392         "title": "Residual Quantization with Implicit Neural Codebooks",
    393         "points": 1,
    394         "comments": 0,
    395         "url": "https://news.ycombinator.com/item?id=41230495",
    396         "created_at": "2024-08-12T23:23:30Z"
    397       }
    398     ],
    399     "top_points": 139,
    400     "total_points": 255,
    401     "total_comments": 116
    402   }
    403 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs