scan.json (18201B)
1 { 2 "paper": { 3 "title": "A Survey of Useful LLM Evaluation", 4 "authors": ["Ji-Lun Peng", "Sijia Cheng", "Egil Diau", "Yung-Yu Shih", "Po-Heng Chen", "Yen-Ting Lin", "Yun-Nung Chen"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2406.00936", 8 "doi": "10.48550/arXiv.2406.00936" 9 }, 10 "scan_version": 2, 11 "active_modules": ["survey_methodology"], 12 "checklist": { 13 "artifacts": { 14 "code_released": { 15 "applies": true, 16 "answer": true, 17 "justification": "The paper provides a GitHub link: https://github.com/MiuLab/EvalLLM-Survey (footnote 1 in the abstract)." 18 }, 19 "data_released": { 20 "applies": true, 21 "answer": false, 22 "justification": "No structured dataset of surveyed papers, extracted metadata, or analysis data is released. The GitHub repo is referenced but no dataset artifact is described." 23 }, 24 "environment_specified": { 25 "applies": true, 26 "answer": false, 27 "justification": "No environment specifications are provided. This is a survey paper but could have provided a reproducible analysis environment." 28 }, 29 "reproduction_instructions": { 30 "applies": true, 31 "answer": false, 32 "justification": "No instructions for reproducing the survey's paper selection, categorization, or analysis are provided." 33 } 34 }, 35 "statistical_methodology": { 36 "confidence_intervals_or_error_bars": { 37 "applies": false, 38 "answer": false, 39 "justification": "This is a narrative survey that does not perform statistical analysis or experiments." 40 }, 41 "significance_tests": { 42 "applies": false, 43 "answer": false, 44 "justification": "No statistical tests are performed; this is a narrative survey." 45 }, 46 "effect_sizes_reported": { 47 "applies": false, 48 "answer": false, 49 "justification": "No experiments are conducted; this is a narrative survey." 50 }, 51 "sample_size_justified": { 52 "applies": false, 53 "answer": false, 54 "justification": "No experiments or quantitative analysis performed." 
55 }, 56 "variance_reported": { 57 "applies": false, 58 "answer": false, 59 "justification": "No experiments or quantitative analysis performed." 60 } 61 }, 62 "evaluation_design": { 63 "baselines_included": { 64 "applies": true, 65 "answer": true, 66 "justification": "The paper positions itself against prior surveys by Guo et al. (2023) and Chang et al. (2023), noting 'no study has offered a phased framework to explore the usability of LLMs' (Section 1.3)." 67 }, 68 "baselines_contemporary": { 69 "applies": true, 70 "answer": true, 71 "justification": "The compared prior surveys (Guo et al. 2023, Chang et al. 2023) are contemporary to this work." 72 }, 73 "ablation_study": { 74 "applies": false, 75 "answer": false, 76 "justification": "This is a survey paper with no system components to ablate." 77 }, 78 "multiple_metrics": { 79 "applies": false, 80 "answer": false, 81 "justification": "No experiments are conducted; this is a narrative survey." 82 }, 83 "human_evaluation": { 84 "applies": false, 85 "answer": false, 86 "justification": "No system outputs to evaluate; this is a survey." 87 }, 88 "held_out_test_set": { 89 "applies": false, 90 "answer": false, 91 "justification": "No experiments conducted." 92 }, 93 "per_category_breakdown": { 94 "applies": true, 95 "answer": true, 96 "justification": "The paper organizes findings into a two-stage framework with detailed breakdowns: core ability (reasoning, societal impact, domain knowledge) and agent (planning, application scenarios, benchmark). Figures 2 and 3 provide structured overviews." 97 }, 98 "failure_cases_discussed": { 99 "applies": true, 100 "answer": true, 101 "justification": "The paper discusses limitations of LLMs in various domains, e.g., ChatGPT's failures on HotpotQA (Section 2.1.4), GPT-3's 78% accuracy being insufficient for real legal work (Section 2.3.2), GPT-4V failing on facial expression recognition (Section 2.3.3)." 
102 }, 103 "negative_results_reported": { 104 "applies": true, 105 "answer": true, 106 "justification": "Multiple negative findings reported: LLMs cannot meet ICMJE authorship criteria (Section 2.3.4), pre-trained LLMs not ready for automatic case judgement summarization (Section 2.3.2), ChatGPT falls short on multi-hop reasoning (Section 2.1.4)." 107 } 108 }, 109 "claims_and_evidence": { 110 "abstract_claims_supported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The abstract claims to provide a two-stage framework and discuss evaluation methods in each stage, which the paper delivers through Sections 2 and 3." 114 }, 115 "causal_claims_justified": { 116 "applies": false, 117 "answer": false, 118 "justification": "The paper is a survey that reports findings from other papers; it makes no original causal claims." 119 }, 120 "generalization_bounded": { 121 "applies": true, 122 "answer": false, 123 "justification": "The title 'A Survey of Useful LLM Evaluation' is broad but the paper's coverage is uneven—heavy on reasoning and domain knowledge, light on many practical evaluation areas. No explicit bounding of scope to what was actually covered." 124 }, 125 "alternative_explanations_discussed": { 126 "applies": false, 127 "answer": false, 128 "justification": "This is a pure survey/taxonomy presenting no original empirical results." 129 }, 130 "proxy_outcome_distinction": { 131 "applies": false, 132 "answer": false, 133 "justification": "This is a survey paper with no original measurements." 134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": false, 139 "answer": false, 140 "justification": "No models are used in experiments; this is a survey." 141 }, 142 "prompts_provided": { 143 "applies": false, 144 "answer": false, 145 "justification": "No prompting is used; this is a survey." 
146 }, 147 "hyperparameters_reported": { 148 "applies": false, 149 "answer": false, 150 "justification": "No experiments conducted." 151 }, 152 "scaffolding_described": { 153 "applies": false, 154 "answer": false, 155 "justification": "No agentic scaffolding used." 156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": false, 160 "justification": "The paper does not describe its paper selection pipeline—no search queries, databases searched, inclusion/exclusion criteria, or filtering stages are documented. It is unclear how papers were selected for inclusion." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": false, 167 "justification": "There is no dedicated limitations section. The 'Future Directions' section (Section 4) discusses gaps in the field but not limitations of the survey itself." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": false, 172 "justification": "No threats to validity of the survey methodology are discussed." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": false, 177 "justification": "The paper does not explicitly state what it excluded or what its coverage boundaries are. The scope is implied by the two-stage framework but never explicitly bounded." 178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": false, 184 "justification": "No structured list of all papers considered, included, or excluded is provided." 185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": false, 189 "justification": "The paper does not describe how the surveyed papers were collected—no search strategy, databases, or time period specified." 190 }, 191 "recruitment_methods_described": { 192 "applies": false, 193 "answer": false, 194 "justification": "Not applicable: the survey involves no human participants. Its data source is published literature rather than a standard benchmark, so there is no recruitment to describe." 
195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": false, 199 "justification": "No documentation of the pipeline from initial paper identification to final inclusion in the survey." 200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": false, 206 "justification": "No funding information is disclosed anywhere in the paper." 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "All authors are listed as affiliated with National Taiwan University, Taipei, Taiwan." 212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding is disclosed, so independence cannot be assessed." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests or financial interests statement is provided." 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": false, 227 "answer": false, 228 "justification": "This is a survey paper that does not evaluate any pre-trained model on a benchmark." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": false, 232 "answer": false, 233 "justification": "This is a survey paper that does not evaluate any pre-trained model on a benchmark." 234 }, 235 "benchmark_contamination_addressed": { 236 "applies": false, 237 "answer": false, 238 "justification": "This is a survey paper that does not evaluate any pre-trained model on a benchmark." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants in this survey." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants in this survey." 
251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this survey." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this survey." 261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this survey." 266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this survey." 271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this survey." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "This is a survey paper with no method to cost." 283 }, 284 "compute_budget_stated": { 285 "applies": false, 286 "answer": false, 287 "justification": "This is a survey paper with no compute requirements." 288 } 289 }, 290 "survey_methodology": { 291 "prisma_or_structured_protocol": { 292 "applies": true, 293 "answer": false, 294 "justification": "No PRISMA diagram, no systematic search protocol, no reproducible search queries, no structured review methodology described. Papers appear to be collected ad-hoc." 295 }, 296 "quality_assessment_of_sources": { 297 "applies": true, 298 "answer": false, 299 "justification": "The survey treats all cited papers equally regardless of their methodological quality. No quality scoring rubric or risk-of-bias assessment is applied to the surveyed papers." 300 }, 301 "publication_bias_discussed": { 302 "applies": true, 303 "answer": false, 304 "justification": "No discussion of publication bias or whether the surveyed papers skew toward positive results about LLM capabilities." 
305 } 306 } 307 }, 308 "claims": [ 309 { 310 "claim": "A two-stage framework (core ability → agent) is an effective way to evaluate LLM usability.", 311 "evidence": "The framework is proposed in Section 1.3 and structured through Sections 2 and 3, organizing evaluation around reasoning, societal impact, domain knowledge, planning, and application scenarios.", 312 "supported": "weak" 313 }, 314 { 315 "claim": "GPT-4 achieves human-level logical reasoning in some scenarios.", 316 "evidence": "Section 2.1.1 cites Han et al. (2023) and Liu et al. (2023) finding GPT-4 'qualitatively matches that of humans in some scenarios.'", 317 "supported": "moderate" 318 }, 319 { 320 "claim": "LLMs are not yet ready for fully automatic legal deployment.", 321 "evidence": "Section 2.3.2 cites Deroy et al. (2023) finding 'inconsistent or hallucinated information' in generated legal summaries, and Blair-Stanek et al. (2023) showing GPT-3 reached only 78% on basic statutory reasoning.", 322 "supported": "moderate" 323 }, 324 { 325 "claim": "ChatGPT achieved 75.6% first-pass diagnostic accuracy on clinical vignettes, comparable to physicians' 72.1%.", 326 "evidence": "Section 2.3.4 cites Benoit (2023) using 45 simplified standardized vignettes.", 327 "supported": "moderate" 328 } 329 ], 330 "methodology_tags": ["meta-analysis"], 331 "key_findings": "This survey proposes a two-stage framework for evaluating LLM usability: 'core ability' (reasoning, societal impact, domain knowledge) and 'agent' (planning, application scenarios). It covers evaluation benchmarks and methods across five domains (finance, legislation, psychology, medicine, education) and seven agent application areas. The paper identifies five future directions: dynamic evaluation, LLMs as evaluators, root cause analysis, fine-grained agent evaluation, and robot benchmarks. 
The survey is broad but lacks systematic methodology—no structured search protocol, no quality assessment of sources.", 332 "red_flags": [ 333 { 334 "flag": "No systematic review protocol", 335 "detail": "The survey provides no description of how papers were selected—no search queries, databases, inclusion/exclusion criteria, or PRISMA flow diagram. This makes the coverage ad-hoc and non-reproducible." 336 }, 337 { 338 "flag": "No quality assessment of sources", 339 "detail": "All surveyed papers are treated equally regardless of methodological rigor. A paper with a single evaluation on 70 records (ChatDB, Section 3.2.3) is presented alongside large-scale benchmark studies without any quality differentiation." 340 }, 341 { 342 "flag": "Uncritical presentation of cited results", 343 "detail": "The survey largely accepts and reports findings from cited papers at face value without scrutinizing methodology, sample sizes, or potential biases in those studies." 344 } 345 ], 346 "cited_papers": [ 347 { 348 "title": "Beyond the imitation game: Quantifying and extrapolating the capabilities of language models", 349 "authors": ["Aarohi Srivastava"], 350 "year": 2022, 351 "arxiv_id": "2206.04615", 352 "relevance": "Major LLM evaluation benchmark (BIG-Bench) used across many capability assessments." 353 }, 354 { 355 "title": "Holistic evaluation of language models", 356 "authors": ["Percy Liang"], 357 "year": 2022, 358 "arxiv_id": "2211.09110", 359 "relevance": "HELM is a comprehensive LLM evaluation framework relevant to evaluation methodology." 360 }, 361 { 362 "title": "Evaluating large language models: A comprehensive survey", 363 "authors": ["Zishan Guo"], 364 "year": 2023, 365 "arxiv_id": "2310.19736", 366 "relevance": "Prior comprehensive LLM evaluation survey, direct comparator to this paper." 
367 }, 368 { 369 "title": "A survey on evaluation of large language models", 370 "authors": ["Yupeng Chang"], 371 "year": 2023, 372 "relevance": "Another prior LLM evaluation survey used as a baseline comparison." 373 }, 374 { 375 "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents", 376 "authors": ["Qiusi Zhan"], 377 "year": 2024, 378 "arxiv_id": "2403.02691", 379 "relevance": "Benchmark for evaluating LLM agent vulnerability to indirect prompt injection attacks." 380 }, 381 { 382 "title": "BeaverTails: Towards improved safety alignment of LLM via a human-preference dataset", 383 "authors": ["Jiaming Ji"], 384 "year": 2023, 385 "relevance": "Safety alignment dataset for evaluating LLM content safety." 386 }, 387 { 388 "title": "Judging LLM-as-a-judge with MT-bench and chatbot arena", 389 "authors": ["Lianmin Zheng"], 390 "year": 2024, 391 "relevance": "Key evaluation methodology paper on using LLMs as evaluators." 392 }, 393 { 394 "title": "Reflexion: Language agents with verbal reinforcement learning", 395 "authors": ["Noah Shinn"], 396 "year": 2023, 397 "relevance": "Agentic framework for self-reflective LLM agents, evaluated on code generation and reasoning." 398 }, 399 { 400 "title": "WebArena: A realistic web environment for building autonomous agents", 401 "authors": ["Shuyan Zhou"], 402 "year": 2023, 403 "relevance": "Realistic benchmark for evaluating LLM-based web agents." 404 }, 405 { 406 "title": "ToolLLM: Facilitating large language models to master 16000+ real-world APIs", 407 "authors": ["Yujia Qin"], 408 "year": 2023, 409 "relevance": "Tool learning evaluation framework with ToolEval benchmark for LLM agents." 410 }, 411 { 412 "title": "React: Synergizing reasoning and acting in language models", 413 "authors": ["Shunyu Yao"], 414 "year": 2023, 415 "relevance": "Foundational agentic paradigm combining reasoning and action in LLMs." 416 } 417 ] 418 }