scan.json (23616B)
1 { 2 "paper": { 3 "title": "AIME: AI System Optimization via Multiple LLM Evaluators", 4 "authors": [ 5 "Bhrij Patel", 6 "Souradip Chakraborty", 7 "Wesley A. Suttle", 8 "Mengdi Wang", 9 "Amrit Singh Bedi", 10 "Dinesh Manocha" 11 ], 12 "year": 2024, 13 "venue": "Preprint (Under Review)", 14 "arxiv_id": "2410.03131" 15 }, 16 "checklist": { 17 "artifacts": { 18 "code_released": { 19 "applies": true, 20 "answer": true, 21 "justification": "A GitHub repository is provided: https://github.com/Bridge00/aime (footnote on page 3, repeated on page 6)." 22 }, 23 "data_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The paper uses publicly available benchmarks: LeetCodeHard (from Shinn et al., 2024) and HumanEval (Chen et al., 2021). These are standard public benchmarks not modified by the authors." 27 }, 28 "environment_specified": { 29 "applies": true, 30 "answer": false, 31 "justification": "The paper states experiments were run on 'an Apple M1 Pro and macOS 14.5' (Section 4) but does not provide a requirements.txt, Dockerfile, or detailed library version listing sufficient to recreate the environment." 32 }, 33 "reproduction_instructions": { 34 "applies": true, 35 "answer": false, 36 "justification": "No step-by-step reproduction instructions are provided in the paper. A code repository is linked but the paper itself does not contain a 'Reproducing Results' section or detailed instructions." 37 } 38 }, 39 "statistical_methodology": { 40 "confidence_intervals_or_error_bars": { 41 "applies": true, 42 "answer": true, 43 "justification": "Standard deviations are reported in tables (e.g., Table 1: '89.26 ± 2.10', Table 3) and error bars appear in the figures (Figures 2, 4, 5). Results are reported as mean ± std dev over 3 trials." 
44 }, 45 "significance_tests": { 46 "applies": true, 47 "answer": false, 48 "justification": "The paper claims AIME outperforms Single-Eval but does not report any statistical significance tests (no p-values, t-tests, or similar). Comparisons are based solely on comparing mean values with overlapping standard deviations." 49 }, 50 "effect_sizes_reported": { 51 "applies": true, 52 "answer": true, 53 "justification": "Effect sizes are reported in context: 'up to 62% higher error detection rate', 'up to 16% higher success rate', and tables provide baseline and AIME values (e.g., SR from 83.70% to 89.26%), allowing readers to assess magnitude." 54 }, 55 "sample_size_justified": { 56 "applies": true, 57 "answer": false, 58 "justification": "The paper uses only 39 LeetCodeHard problems and the first 20 HumanEval problems with only 3 trials. No justification is given for why these sample sizes are adequate, and no power analysis is discussed." 59 }, 60 "variance_reported": { 61 "applies": true, 62 "answer": true, 63 "justification": "Standard deviations are reported across 3 trials in all tables (Tables 1, 2, 3, 4) and figures show error bars." 64 } 65 }, 66 "evaluation_design": { 67 "baselines_included": { 68 "applies": true, 69 "answer": true, 70 "justification": "Three baselines are included: Zero-Shot (no optimization), Single-Eval (single evaluator), and Implicit Eval (Reflexion by Shinn et al., 2024). See Section 4.2." 71 }, 72 "baselines_contemporary": { 73 "applies": true, 74 "answer": true, 75 "justification": "TextGrad (Yuksekgonul et al., 2024) and Reflexion (Shinn et al., 2024) are both recent and represent state-of-the-art approaches for iterative text-based optimization." 76 }, 77 "ablation_study": { 78 "applies": true, 79 "answer": true, 80 "justification": "Section 4.3 provides ablation studies: varying number of evaluators (1→3→6), role diversity (same role vs. distinct roles), role combinations (Table 1), and evaluation temperature (Tables 2-3)." 
81 }, 82 "multiple_metrics": { 83 "applies": true, 84 "answer": true, 85 "justification": "Multiple metrics are used: Success Rate (SR), Completion Rate (CR), Error Detection Rate (EDR), and Robustness to Adversarial Evaluator (RAE). See Section 4." 86 }, 87 "human_evaluation": { 88 "applies": true, 89 "answer": false, 90 "justification": "No human evaluation is included. Evaluation of code quality is entirely automated via test case pass/fail and heuristic error detection phrases. Given claims about evaluation thoroughness and explainability (Figure 3), human evaluation of the generated evaluations would be relevant." 91 }, 92 "held_out_test_set": { 93 "applies": true, 94 "answer": false, 95 "justification": "The paper does not describe a held-out test set. The same datasets (LeetCodeHard, HumanEval) appear to be used for all experiments and ablations with no clear dev/test split." 96 }, 97 "per_category_breakdown": { 98 "applies": true, 99 "answer": true, 100 "justification": "Results are broken down per dataset (LeetCodeHard vs. HumanEval), per role combination (Table 1), per temperature setting (Tables 2, 3), and per number of evaluators (Figure 5)." 101 }, 102 "failure_cases_discussed": { 103 "applies": true, 104 "answer": true, 105 "justification": "Section 4.3.2 and Table 1 discuss when AIME fails: using only readability/runtime/redundancy roles degrades performance, and is the only setting where Single-Eval outperforms AIME." 106 }, 107 "negative_results_reported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The paper reports that AIME with only readability/runtime/redundancy roles performs worse than Single-Eval (Table 1, Section 4.3.2), and acknowledges the gap between EDR improvement and downstream SR/CR improvement (Section 4.2 Remark)." 
111 } 112 }, 113 "claims_and_evidence": { 114 "abstract_claims_supported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The abstract claims 'up to 62% higher error detection rate' and 'up to 16% higher success rate' are supported by Tables 2 and 3. The claim about 12% impact from evaluator selection is supported by Table 1." 118 }, 119 "causal_claims_justified": { 120 "applies": true, 121 "answer": true, 122 "justification": "The paper's causal claims ('multiple evaluators reduce the suboptimality gap', 'increasing evaluators helps') are supported by: (1) theoretical proof (Theorem 1) under stated assumptions, and (2) controlled ablation studies varying the number of evaluators while holding other factors constant (Section 4.3.1)." 123 }, 124 "generalization_bounded": { 125 "applies": true, 126 "answer": false, 127 "justification": "The title 'AI System Optimization via Multiple LLM Evaluators' implies general applicability, but results are only on code generation with GPT-4o. The limitations section acknowledges this ('We only empirically study our approach in code generation') but the title and abstract do not bound the claims to code generation." 128 }, 129 "alternative_explanations_discussed": { 130 "applies": true, 131 "answer": true, 132 "justification": "Section 4.2 Remark discusses alternative explanations: the feedback LLM may also detect errors independently of the evaluation, and the EDR measurement may undercount Single-Eval's detection due to needing more detection phrases. This acknowledges that the SR/CR gap may not be solely attributable to evaluation quality." 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper states 'We use GPT-4o for all LLM calls' (Section 4) but does not specify a version snapshot (e.g., gpt-4o-2024-05-13). GPT-4o behavior changes across versions." 
140 }, 141 "prompts_provided": { 142 "applies": true, 143 "answer": true, 144 "justification": "The evaluation system prompt is provided in Figure 6 (Appendix A.1), including the template and role insertion mechanism. The adversarial prompts are also shown in Figure 6." 145 }, 146 "hyperparameters_reported": { 147 "applies": true, 148 "answer": true, 149 "justification": "Section 4 reports: max output tokens (3600 for Single-Eval, 3600/K for AIME), top_p=0.99, temperature ablated across 0/0.25/0.5/0.75/1, 10 iterations per problem, and other LLM calls use 2000 max tokens with temperature 0." 150 }, 151 "scaffolding_described": { 152 "applies": true, 153 "answer": true, 154 "justification": "The scaffolding is described: TextGrad framework with separate evaluation and feedback LLM calls, initial prompt vs. update prompt mechanism, Algorithm 1 specifying the loop structure. See Sections 2 and 3.2." 155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": true, 159 "justification": "The paper states it uses 'the entire LeetCodeHard dataset of 39 problems' and 'the first 20 problems of HumanEval' with no modification. Unit tests are withheld from evaluators. The selection criterion for HumanEval (first 20) is stated." 160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 6 is titled 'Conclusion, Limitations, and Further Works' and contains a dedicated 'Limitations and Further Work' subsection discussing multiple limitations." 167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": true, 171 "justification": "The limitations section identifies specific threats: single task domain (code generation only), single-agent system (not tested with compound/agentic systems), uniform weighting of evaluators. The remark in Section 4.2 also identifies a specific validity concern about the EDR heuristic." 
172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": true, 176 "justification": "The limitations section explicitly states what was NOT tested: other tasks like molecule optimization or text generation, compound AI systems with multiple agents, non-uniform weighting methods." 177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": false, 183 "justification": "Raw experimental data (individual run results, generated evaluations, generated code) is not made available. Only aggregate statistics (mean ± std over 3 trials) are reported." 184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": true, 188 "justification": "The experimental procedure is described: for each coding problem, generate initial code (same for both methods), run 10 iterations of optimization, record test case pass/fail results. See Section 4." 189 }, 190 "recruitment_methods_described": { 191 "applies": false, 192 "answer": false, 193 "justification": "No human participants in this study. The data sources are standard public benchmarks (LeetCodeHard, HumanEval)." 194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": true, 198 "justification": "The pipeline is documented: initial code generation → iterative evaluation (single or multiple) → feedback generation → code update → test case evaluation. Algorithm 1 formalizes the process. The EDR heuristic and phrase list are provided in Appendix A.2." 199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": false, 205 "justification": "No funding or acknowledgments section is present in the paper. Co-author Wesley Suttle is affiliated with the U.S. Army Research Laboratory, but no funding disclosure is made." 206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "Author affiliations are clearly listed: University of Maryland, U.S. 
Army Research Laboratory, Princeton University, University of Central Florida." 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": false, 215 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of any funding disclosure is itself a concern given that one co-author is at a government research lab." 216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "No competing interests or financial interests statement is present in the paper." 221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": true, 226 "answer": false, 227 "justification": "The paper uses GPT-4o to generate and evaluate code on HumanEval and LeetCodeHard benchmarks but does not state GPT-4o's training data cutoff date." 228 }, 229 "train_test_overlap_discussed": { 230 "applies": true, 231 "answer": false, 232 "justification": "HumanEval (published 2021) and LeetCode problems are widely available online and likely in GPT-4o's training data. The paper does not discuss potential train/test overlap at all." 233 }, 234 "benchmark_contamination_addressed": { 235 "applies": true, 236 "answer": false, 237 "justification": "HumanEval was published in 2021 and LeetCode problems are publicly available. GPT-4o was almost certainly trained after these were published. The paper does not address contamination risk, which is especially relevant since they compare iterative refinement methods — contamination could differentially affect baseline vs. AIME if the model already 'knows' solutions." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": false, 243 "answer": false, 244 "justification": "No human participants in this study." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human participants in this study." 
250 }, 251 "demographics_reported": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in this study." 260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in this study." 265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in this study." 270 }, 271 "attrition_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this study." 275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": true, 280 "answer": false, 281 "justification": "AIME uses K separate LLM evaluation calls per iteration (K=6 in the main experiments) vs. 1 for Single-Eval, multiplying API costs. No inference cost, total API spend, or latency comparison is reported despite this being a significant practical consideration." 282 }, 283 "compute_budget_stated": { 284 "applies": true, 285 "answer": false, 286 "justification": "The paper mentions hardware ('Apple M1 Pro and macOS 14.5') but does not quantify total computational budget, API costs, or wall-clock time for experiments." 287 } 288 } 289 }, 290 "claims": [ 291 { 292 "claim": "AIME achieves up to 62% higher error detection rate than single-evaluator approaches on LeetCodeHard.", 293 "evidence": "Table 2 shows EDR at temperature 0: Single-Eval 38.06% vs AIME 91.20% on LeetCodeHard, a difference of ~53 percentage points. 
The 62% figure appears in the abstract and is supported by the range across temperatures (Section 4.1).", 294 "supported": "moderate" 295 }, 296 { 297 "claim": "AIME-based optimization achieves up to 16% higher success rate than single-evaluator optimization.", 298 "evidence": "Table 3 shows SR improvements across temperatures. The largest gap on LeetCodeHard is ~6.3 percentage points (82.96% vs 89.30% at temp 0.5). On HumanEval the gaps are smaller (~1-3 pp). The '16%' figure in the abstract likely refers to a relative improvement, but even so appears to be an optimistic reading of the data.", 299 "supported": "moderate" 300 }, 301 { 302 "claim": "A linear combination of multiple evaluators can approximate the optimal evaluation policy (Theorem 1).", 303 "evidence": "Theorem 1 in Section 3.1 provides a formal proof under the linear additivity assumption on the aggregation function g. The proof shows the suboptimality gap is bounded by the total variation distance between the optimal evaluator and the mixture.", 304 "supported": "strong" 305 }, 306 { 307 "claim": "The choice of number of evaluators and role combinations impacts success rate by up to 12%.", 308 "evidence": "Table 1 shows that AIME with readability/runtime/redundancy roles achieves SR of 77.41% vs 89.26% with all 6 roles, a difference of ~12 percentage points on LeetCodeHard.", 309 "supported": "strong" 310 }, 311 { 312 "claim": "Increasing evaluator diversity (distinct roles vs. 
same role) improves performance.", 313 "evidence": "Figure 5 (right) shows that 6 distinct-role evaluators outperform 6 same-role (correctness) evaluators in SR, CR, and EDR on LeetCodeHard.", 314 "supported": "moderate" 315 } 316 ], 317 "methodology_tags": [ 318 "benchmark-eval", 319 "theoretical" 320 ], 321 "key_findings": "AIME proposes using multiple role-specific LLM evaluators instead of a single evaluator for iterative AI system optimization, with theoretical justification that mixture-of-evaluators can reduce the suboptimality gap. On LeetCodeHard and HumanEval code generation benchmarks using GPT-4o, AIME with 6 evaluators achieves substantially higher error detection rates (53-62 pp improvement) and moderately higher success rates (3-7 pp improvement) compared to single-evaluator baselines. Ablation studies reveal that both the number and diversity of evaluator roles matter, though the downstream success rate improvement is much smaller than the error detection improvement.", 322 "red_flags": [ 323 { 324 "flag": "Very small sample size", 325 "detail": "Only 39 LeetCodeHard problems and 20 HumanEval problems with 3 trials. The gaps between AIME and Single-Eval are small relative to the reported standard deviations (e.g., SR 83.70±2.28 vs 89.26±2.10, whose ±1 std ranges are separated by only ~1.2 points), and no significance tests are performed to determine whether differences are statistically meaningful." 326 }, 327 { 328 "flag": "Benchmark contamination not addressed", 329 "detail": "HumanEval (2021) and LeetCode problems are very likely in GPT-4o's training data. The paper does not discuss this risk. If the model already 'knows' solutions, the relative benefit of better evaluation may be confounded." 330 }, 331 { 332 "flag": "No cost analysis despite multiplicative API overhead", 333 "detail": "AIME with K=6 evaluators makes 6x more evaluation API calls per iteration than Single-Eval. The paper does not report any cost comparison, making practical deployment assessment impossible." 
334 }, 335 { 336 "flag": "Unspecified model version", 337 "detail": "Only 'GPT-4o' is stated without a version/snapshot date. GPT-4o behavior changes across versions, making reproduction uncertain." 338 }, 339 { 340 "flag": "EDR heuristic is fragile", 341 "detail": "The Error Detection Rate metric relies on keyword matching of specific phrases (Appendix A.2). The authors acknowledge this may undercount Single-Eval's detections (Section 4.2 Remark), potentially inflating AIME's advantage. The 62% gap in EDR translates to only a ~6 pp gap in actual success rate." 342 }, 343 { 344 "flag": "Overstated headline numbers", 345 "detail": "The abstract's '16% higher success rate' is presented as if it is a percentage-point improvement, but the actual SR improvements in Tables 3 and 4 range from ~1-7 percentage points depending on dataset and temperature. The framing is misleading." 346 } 347 ], 348 "cited_papers": [ 349 { 350 "title": "Evaluating large language models trained on code", 351 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 352 "year": 2021, 353 "arxiv_id": "2107.03374", 354 "relevance": "Introduces HumanEval benchmark used in this paper and foundational to LLM code generation evaluation." 355 }, 356 { 357 "title": "Reflexion: Language agents with verbal reinforcement learning", 358 "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath"], 359 "year": 2024, 360 "relevance": "Baseline method for iterative LLM self-improvement; provides the LeetCodeHard dataset used in experiments." 361 }, 362 { 363 "title": "TextGrad: Automatic 'differentiation' via text", 364 "authors": ["Mert Yuksekgonul", "Federico Bianchi", "Joseph Boen"], 365 "year": 2024, 366 "arxiv_id": "2406.07496", 367 "relevance": "Core framework used to implement AIME; state-of-the-art AI system optimization via text-based gradients." 
368 }, 369 { 370 "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena", 371 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 372 "year": 2023, 373 "arxiv_id": "2306.05685", 374 "relevance": "Foundational work on LLM-as-a-Judge paradigm, directly relevant to evaluation methodology in LLM systems." 375 }, 376 { 377 "title": "Replacing judges with juries: Evaluating LLM generations with a panel of diverse models", 378 "authors": ["Pat Verga", "Sebastian Hofstatter", "Sophia Althammer"], 379 "year": 2024, 380 "arxiv_id": "2404.18796", 381 "relevance": "Closely related work showing panels of smaller LLM judges can outperform single larger judges, directly relevant to multi-evaluator approaches." 382 }, 383 { 384 "title": "Self-refine: Iterative refinement with self-feedback", 385 "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"], 386 "year": 2024, 387 "relevance": "Pioneering work on iterative text-based feedback loops for LLM refinement, precursor to AI system optimization." 388 }, 389 { 390 "title": "DSPy: Compiling declarative language model calls into state-of-the-art pipelines", 391 "authors": ["Omar Khattab", "Arnav Singhvi", "Paridhi Maheshwari"], 392 "year": 2024, 393 "relevance": "Major framework for LLM pipeline optimization, directly relevant to compound AI system design and evaluation." 394 }, 395 { 396 "title": "Critique-out-loud reward models", 397 "authors": ["Zachary Ankner", "Mansheej Paul", "Brandon Cui"], 398 "year": 2024, 399 "arxiv_id": "2408.11791", 400 "relevance": "Uses LLM-generated critiques to augment reward models, related to multi-criteria evaluation approaches." 401 }, 402 { 403 "title": "A survey on evaluating large language models in code generation tasks", 404 "authors": ["Liguo Chen", "Qi Guo", "Hongrui Jia"], 405 "year": 2024, 406 "arxiv_id": "2408.16498", 407 "relevance": "Survey of LLM code generation evaluation methods, directly relevant to methodology quality in this domain." 
408 }, 409 { 410 "title": "GPT-4 technical report", 411 "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"], 412 "year": 2023, 413 "arxiv_id": "2303.08774", 414 "relevance": "Technical report for the foundation model family used in this paper's experiments." 415 } 416 ] 417 }