scan.json (24339B)
1 { 2 "paper": { 3 "title": "Assessing and Verifying Task Utility in LLM-Powered Applications", 4 "authors": [ 5 "Negar Arabzadeh", 6 "Siqing Huo", 7 "Nikhil Mehta", 8 "Qingyun Wu", 9 "Chi Wang", 10 "Ahmed Awadallah", 11 "Charles L. A. Clarke", 12 "Julia Kiseleva" 13 ], 14 "year": 2024, 15 "venue": "arXiv", 16 "arxiv_id": "2405.02178" 17 }, 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The paper states 'we make the data, code and all the logs publicly available at https://bit.ly/3w3yKcS' in the abstract, and references a Git repository in Section 5.1." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The paper uses publicly available datasets (MATH and ALFWorld) and states all data and logs are publicly available at the provided URL." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is mentioned in the paper. The implementation details (Section 5.1) only mention GPT-4 version and temperature, not the software environment." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "The paper references a Git repository but does not include step-by-step reproduction instructions in the paper itself. The implementation section (5.1) describes the general setup but not specific commands or scripts to run." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": true, 45 "justification": "The paper reports 95% confidence intervals on multiple figures (Fig. 3, Fig. 4, Fig. 10, Fig. 11), and explicitly discusses them: 'even with 95% interval confidence on all the success and failed cases' (Section 5.2)." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "Despite making comparative claims about which solutions perform better (e.g., 'Autogen outperforms ReAct and Vanilla GPT-4'), no formal significance tests (p-values, t-tests, etc.) are reported." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper compares solutions primarily through bar charts and distributions but does not report formal effect sizes (Cohen's d, percentage improvements with baselines, etc.). Comparisons are qualitative or visual." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "The paper uses 120 math problems and 134 ALFWorld tasks without justifying why these sample sizes are sufficient. No power analysis is provided." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": true, 65 "justification": "The paper reports variance through coefficient of variation analysis across 50 runs with different seeds (Section 6.2.1, Fig. 6), and box plots showing distribution spread (Fig. 5). The coefficient of variation is explicitly computed and discussed." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "The paper evaluates AgentEval on three different solutions: AutoGen, Langchain ReAct, and Vanilla GPT-4 solver (Section 4.1). These serve as comparative baselines for assessing AgentEval's ability to distinguish performance." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "AutoGen (Wu et al., 2023) and ReAct (Yao et al., 2022) are contemporary and widely-used agent frameworks at the time of publication." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "The paper studies the contribution of different components: task-based vs. solution-based criteria (Section 6.1), criteria stability analysis removing unstable criteria (Section 6.2.1), and the VerifierAgent's role in filtering criteria (Section 6.2.3)." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "The paper uses multiple evaluation dimensions: criteria diversity (Section 6.1), criteria stability via coefficient of variation (Section 6.2.1), discriminative power via adversarial testing (Section 6.2.2), and per-criterion quantified performance (Section 5.2)." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "The paper explicitly acknowledges in Limitations (Section 8.1): 'the absence of human evaluation in our validation process could be viewed as a drawback.' No human evaluation of AgentEval's outputs is performed, despite the framework being about assessing utility that would benefit from human validation." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": false, 97 "justification": "There is no separation of development and test data. The same 120 math problems and 134 ALFWorld tasks are used throughout all experiments. No held-out set is mentioned." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Results are broken down by individual criteria (Clarity, Efficiency, Error Analysis, Completeness) in Fig. 3, by success/failure categories, and by solution type. ALFWorld results are similarly broken down in Fig. 10." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper discusses the 'error analysis' criterion that fails to distinguish success/failure cases (Section 6.2.1, Appendix A.4.2), and discusses criteria that lack discriminative power (Section 6.2.2)." 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper reports that the 'error analysis' criterion shows 'variable performance' and 'does not consistently predict one group (success or failed) to perform better than the other' (Appendix A.4.2). It recommends 'either modify or eliminate this criterion.'" 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims a 'comprehensive analysis of the effectiveness and robustness of AgentEval' for two datasets, which is supported by the experiments in Sections 5-6. The claims are appropriately hedged." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper's causal claims are mostly about the framework's design choices (e.g., solution-based criteria produce more diversity). These are supported by controlled comparisons — e.g., task-based vs. solution-based criteria generation (Section 6.1) and adversarial perturbation experiments (Section 6.2.2) that use controlled manipulations." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The title claims to address 'LLM-Powered Applications' broadly, but the framework is only tested on math problem solving and ALFWorld household tasks. The Limitations section acknowledges testing was 'limited to specific scenarios' but the title and abstract still overgeneralize. The paper says AgentEval can 'gauge the utility of arbitrary LLM-powered agentic applications' based on only two domains." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper does not discuss alternative explanations for its results. For example, it does not consider whether GPT-4's known biases as an evaluator might systematically favor certain solution types, or whether the criteria stability results could be due to the model memorizing criteria rather than genuinely generating relevant ones." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": true, 141 "justification": "Section 5.1 states: 'we use GPT-4 version 0613, accessed through Azure OpenAI services' — this is a specific model version." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "Section 5.1 states 'we further provide all the prompts used in our experiments in our Git repository.' The prompts are made available via the code release." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "Section 5.1 reports 'the temperature of 0' for all experiments. The number of seeds (50) and similarity threshold (τ) values (0.7, 0.85, 1) are also reported." 152 }, 153 "scaffolding_described": { 154 "applies": true, 155 "answer": true, 156 "justification": "The three-agent scaffolding (CriticAgent, QuantifierAgent, VerifierAgent) is described in detail in Section 3, with the workflow illustrated in Fig. 1. The roles, inputs, and outputs of each agent are specified. The VerifierAgent algorithm is provided in Algorithm 1." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "The paper describes the data selection process: 120 level-5 problems from MATH dataset (20 from each of 6 categories) following Wu et al. (2023), and 134 test cases from ALFWorld (Section 4)." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 8.1 is a dedicated Limitations section that discusses four specific limitations of the work." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 8.1 discusses specific threats: dependence on output log quality, exclusive use of closed-source GPT-4 limiting generalizability, limited test scenarios, and absence of human evaluation in validation." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section 3 explicitly states: 'AgentEval is currently focused on tasks where success is clearly defined and multiple successful solutions may exist.' Section 8.1 notes experiments were 'limited to specific scenarios within math problem solving and household tasks.'" 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": true, 185 "justification": "The abstract states 'we make the data, code and all the logs publicly available' and provides a URL. The underlying datasets (MATH, ALFWorld) are also publicly available." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 4 describes the MATH dataset (12,500 problems, level-5 subset of 120 problems from 6 categories) and ALFWorld dataset (134-test set) with specific references to their origins and selection criteria." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants are involved. The data sources are standard public benchmarks (MATH, ALFWorld)." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The pipeline is documented: CriticAgent generates criteria → QuantifierAgent quantifies each criterion → VerifierAgent validates criteria. The criteria refinement pipeline (50 runs, consolidation, coefficient of variation filtering) is described in Section 6 and Algorithm 1." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding sources are disclosed. The Acknowledgement section thanks individuals for 'inspiring discussions' but does not mention any grants or funding." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are listed: University of Waterloo, Microsoft Research, Pennsylvania State University, Purdue University. The footnote notes 'Work done during an internship at Microsoft Research.'" 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "Multiple authors are from Microsoft Research, and the paper evaluates AutoGen (a Microsoft product) as one of the solutions, finding it outperforms alternatives. The funder/employer has a stake in the outcome, and this potential conflict is not acknowledged." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interest declaration is present in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "The paper uses GPT-4 (version 0613) to evaluate solutions on MATH and ALFWorld benchmarks, but does not state GPT-4's training data cutoff. Since GPT-4 is used both as the solution LLM and as the evaluator, contamination is a concern." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No discussion of whether GPT-4 may have seen MATH or ALFWorld problems during training. The MATH dataset was published in 2021, well before GPT-4's training cutoff." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "The MATH dataset (Hendrycks et al., 2021) was published before GPT-4's training, meaning GPT-4 may have been trained on these problems. This contamination risk is not addressed." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants are involved in this study. The evaluation is entirely automated using LLM agents." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants are involved." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants are involved." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants are involved." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants are involved." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants are involved." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants are involved." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "The paper mentions trying to 'keep the balance between computational costs and analyzing the robustness' (Section 6.1) but does not report actual API costs, token consumption, or wall-clock time for the evaluation process." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "The paper runs 50 seeds of CriticAgent and QuantifierAgent across 120+ problems using GPT-4, which likely involves substantial API costs. No total computational budget is stated." 289 } 290 } 291 }, 292 "claims": [ 293 { 294 "claim": "AgentEval can distinguish between successful and failed cases across multiple criteria, even with 95% confidence intervals.", 295 "evidence": "Section 5.2 and Fig. 3 show that for most criteria, successful and failed cases are distinguished with 95% confidence intervals on math problems. Similar results for ALFWorld in Fig. 10.", 296 "supported": "moderate" 297 }, 298 { 299 "claim": "AutoGen outperforms ReAct and Vanilla GPT-4 in terms of accuracy, completeness, and efficiency on math problems.", 300 "evidence": "Section 5.2 and Fig. 3 show quantified criteria values where AutoGen bars are higher. However, no significance tests are provided to confirm these differences.", 301 "supported": "weak" 302 }, 303 { 304 "claim": "Solution-based criteria generation produces more diverse criteria than task-based generation.", 305 "evidence": "Section 6.1 and Fig. 4 show that solution-based approaches (especially AutoGen) yield more unique criteria, though this varies by model creativity. The analysis uses semantic similarity filtering at multiple thresholds.", 306 "supported": "moderate" 307 }, 308 { 309 "claim": "The QuantifierAgent is robust, with mean coefficient of variation around or below 0.5 across criteria.", 310 "evidence": "Section 6.2.1 and Fig. 6 show the coefficient of variation stabilizes after approximately 18 seeds. However, whether 0.5 represents acceptable robustness is not formally justified.", 311 "supported": "moderate" 312 }, 313 { 314 "claim": "AgentEval provides a scalable and cost-effective alternative to human evaluations for assessing LLM-powered applications.", 315 "evidence": "The framework is demonstrated on two datasets, but no comparison with actual human evaluation is performed. The Limitations section (8.1) explicitly acknowledges this gap.", 316 "supported": "weak" 317 }, 318 { 319 "claim": "The VerifierAgent can identify and filter out unreliable criteria, improving the overall assessment quality.", 320 "evidence": "Section 6.2.3 describes the VerifierAgent's role, and Algorithm 1 provides the procedure. The 'error analysis' criterion is identified as unstable (Appendix A.4.2). However, no before/after comparison demonstrates quantitative improvement.", 321 "supported": "weak" 322 } 323 ], 324 "methodology_tags": [ 325 "benchmark-eval" 326 ], 327 "key_findings": "AgentEval introduces a three-agent framework (CriticAgent, QuantifierAgent, VerifierAgent) for automatically assessing the utility of LLM-powered applications beyond binary success/failure metrics. The framework demonstrates the ability to distinguish successful from failed cases across multiple criteria with 95% confidence intervals on MATH and ALFWorld benchmarks. The QuantifierAgent shows reasonable robustness (coefficient of variation below 0.5) after approximately 18 seeds, and solution-based criteria generation produces more diverse evaluation criteria than task-based generation. However, the framework lacks human evaluation validation, and the claim of being a 'scalable alternative to human evaluations' is not directly substantiated.", 328 "red_flags": [ 329 { 330 "flag": "Conflict of interest: Microsoft authors evaluating Microsoft product", 331 "detail": "Multiple authors are from Microsoft Research, and the paper evaluates AutoGen (a Microsoft product) favorably against competitors. AutoGen is found to outperform ReAct and Vanilla GPT-4 across criteria, but this conflict is not acknowledged." 332 }, 333 { 334 "flag": "No human evaluation to validate LLM-as-evaluator claims", 335 "detail": "The paper proposes LLMs as alternatives to human evaluation but never validates against actual human judgments. The authors acknowledge this limitation but it fundamentally undermines the central claim." 336 }, 337 { 338 "flag": "Circular evaluation: GPT-4 evaluates GPT-4-based solutions", 339 "detail": "GPT-4 is used both as the underlying model for the solutions being evaluated (AutoGen, Vanilla solver) and as the evaluator (CriticAgent, QuantifierAgent). This circularity could bias results, as GPT-4 may systematically favor its own outputs." 340 }, 341 { 342 "flag": "Benchmark contamination risk unaddressed", 343 "detail": "MATH dataset (2021) was published before GPT-4's training cutoff, meaning the model may have memorized these problems. This is particularly problematic since GPT-4 serves as both solver and evaluator." 344 }, 345 { 346 "flag": "No cost reporting despite heavy API usage", 347 "detail": "The experiments involve 50 runs of CriticAgent, 50 runs of QuantifierAgent on 120+ problems, all using GPT-4. The total API cost is never reported, undermining the 'cost-effective' claim." 348 } 349 ], 350 "cited_papers": [ 351 { 352 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework", 353 "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"], 354 "year": 2023, 355 "arxiv_id": "2308.08155", 356 "relevance": "Core multi-agent framework used for implementing AgentEval and as one of the evaluated solutions." 357 }, 358 { 359 "title": "AgentBench: Evaluating LLMs as Agents", 360 "authors": ["Xiao Liu"], 361 "year": 2023, 362 "arxiv_id": "2308.03688", 363 "relevance": "Benchmark for evaluating LLMs as agents in interactive environments, directly relevant to agentic AI evaluation methodology." 364 }, 365 { 366 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 367 "authors": ["Shunyu Yao"], 368 "year": 2022, 369 "arxiv_id": "2210.03629", 370 "relevance": "Key agent framework used as a baseline solution in the paper's evaluation." 371 }, 372 { 373 "title": "ChatEval: Towards Better LLM-based Evaluators through Multi-Agent Debate", 374 "authors": ["Chi-Min Chan"], 375 "year": 2023, 376 "relevance": "Multi-agent debate framework for evaluation, directly relevant to LLM-as-judge methodology." 377 }, 378 { 379 "title": "Can Large Language Models Be Trusted for Evaluation? Scalable Meta-Evaluation of LLMs as Evaluators via Agent Debate", 380 "authors": ["Steffi Chern"], 381 "year": 2024, 382 "arxiv_id": "2401.16788", 383 "relevance": "Studies trustworthiness of LLMs as evaluators through agent debate, directly relevant to evaluator reliability." 384 }, 385 { 386 "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework", 387 "authors": ["Sirui Hong"], 388 "year": 2023, 389 "arxiv_id": "2308.00352", 390 "relevance": "Multi-agent collaboration framework relevant to agentic AI architecture." 391 }, 392 { 393 "title": "Large Language Models Cannot Self-Correct Reasoning Yet", 394 "authors": ["Jie Huang"], 395 "year": 2023, 396 "relevance": "Identifies weaknesses in single LLMs as evaluators, motivating multi-agent evaluation approaches." 397 }, 398 { 399 "title": "CAMEL: Communicative Agents for 'Mind' Exploration of Large Scale Language Model Society", 400 "authors": ["Guohao Li"], 401 "year": 2023, 402 "arxiv_id": "2303.17760", 403 "relevance": "Multi-agent framework for exploring LLM collaboration, relevant to agentic AI architectures." 404 }, 405 { 406 "title": "Holistic Evaluation of Language Models", 407 "authors": ["Percy Liang"], 408 "year": 2023, 409 "relevance": "Comprehensive LLM evaluation framework directly relevant to evaluation methodology quality." 410 }, 411 { 412 "title": "A Survey on Evaluation of Large Language Models", 413 "authors": ["Yupeng Chang"], 414 "year": 2023, 415 "relevance": "Survey of LLM evaluation methods, relevant to understanding the evaluation landscape." 416 }, 417 { 418 "title": "Autonomous Evaluation and Refinement of Digital Agents", 419 "authors": ["Jiayi Pan"], 420 "year": 2024, 421 "arxiv_id": "2404.06474", 422 "relevance": "Autonomous agent evaluation and refinement, closely related to the AgentEval framework's goals." 423 }, 424 { 425 "title": "Aligning Offline Metrics and Human Judgments of Value for Code Generation Models", 426 "authors": ["Victor Dibia"], 427 "year": 2023, 428 "relevance": "Studies alignment between automated metrics and human judgments for LLM code generation, relevant to evaluation methodology." 429 } 430 ] 431 }