scan-v5.json (26176B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Does Prompt Formatting Have Any Impact on LLM Performance?", 6 "authors": [ 7 "Jia He", 8 "Mukund Rungta", 9 "David Koleczek", 10 "Arshdeep Sekhon", 11 "Franklin X Wang", 12 "Sadid Hasan" 13 ], 14 "year": 2024, 15 "venue": "arXiv.org", 16 "arxiv_id": "2411.10541", 17 "doi": null 18 }, 19 "checklist": { 20 "claims_and_evidence": { 21 "abstract_claims_supported": { 22 "applies": true, 23 "answer": true, 24 "justification": "All abstract claims are empirically supported: performance variations documented in Table 1 (up to 54pp on HumanEval); GPT-4's robustness confirmed in Figure 6 via Coefficient of Mean Deviation; significant variations shown via p-values < 0.001.", 25 "source": "haiku" 26 }, 27 "causal_claims_justified": { 28 "applies": true, 29 "answer": true, 30 "justification": "Paper uses ablation design: isolates prompt format as sole variable by keeping semantic content identical across formats (Appendix C). Causal claims about format effects are justified by this controlled experimental design.", 31 "source": "haiku" 32 }, 33 "generalization_bounded": { 34 "applies": true, 35 "answer": true, 36 "justification": "Scope appropriately bounded to OpenAI GPT models and 6 specific benchmarks. Section 7 explicitly acknowledges GPT-only focus and plans future work on other LLM families.", 37 "source": "haiku" 38 }, 39 "alternative_explanations_discussed": { 40 "applies": true, 41 "answer": false, 42 "justification": "Paper documents that format affects performance but does not discuss why—no exploration of tokenization, training data distribution, or transformer architecture mechanisms. Section D.2 speculates about laziness vs. 
format processing but doesn't systematically explore alternatives.", 43 "source": "haiku" 44 }, 45 "proxy_outcome_distinction": { 46 "applies": true, 47 "answer": true, 48 "justification": "Claims directly match measurements: 'format affects performance' is measured via accuracy (MMLU), pass@1 (HumanEval), BLEU (code translation). No proxy conflation.", 49 "source": "haiku" 50 } 51 }, 52 "limitations_and_scope": { 53 "limitations_section_present": { 54 "applies": true, 55 "answer": true, 56 "justification": "Dedicated Section 7 'Limitations' explicitly lists three specific limitations: GPT-only focus, formats excluded (HTML/XML), other prompt engineering dimensions not varied.", 57 "source": "haiku" 58 }, 59 "threats_to_validity_specific": { 60 "applies": true, 61 "answer": true, 62 "justification": "Specific threats named: 'GPT-based models' vs. other LLM families, missing format types, other prompt design elements held constant. Not boilerplate disclaimers.", 63 "source": "haiku" 64 }, 65 "scope_boundaries_stated": { 66 "applies": true, 67 "answer": true, 68 "justification": "Scope clearly bounded to 4 formats, 6 benchmarks, 4 GPT models, temperature=0 for consistency. Implicitly states findings don't explain mechanism or provide universal format guidance.", 69 "source": "haiku" 70 } 71 }, 72 "conflicts_of_interest": { 73 "funding_disclosed": { 74 "applies": true, 75 "answer": false, 76 "justification": "No funding source statement, no acknowledgments section identifying funder. 
Affiliations with Microsoft and MIT stated but no explicit funding disclosure.", 77 "source": "haiku" 78 }, 79 "affiliations_disclosed": { 80 "applies": true, 81 "answer": true, 82 "justification": "Author affiliations clearly listed: Microsoft and MIT for each author.", 83 "source": "haiku" 84 }, 85 "funder_independent_of_outcome": { 86 "applies": true, 87 "answer": true, 88 "justification": "Microsoft affiliation evaluates OpenAI's competing models (not Microsoft's own Copilot), suggesting independence. However, funding source unconfirmed.", 89 "source": "haiku" 90 }, 91 "financial_interests_declared": { 92 "applies": true, 93 "answer": false, 94 "justification": "No competing interests statement provided. No declaration of patents, equity, or consulting relationships.", 95 "source": "haiku" 96 } 97 }, 98 "scope_and_framing": { 99 "key_terms_defined": { 100 "applies": true, 101 "answer": true, 102 "justification": "Key metrics formally defined: 'sensitivity' (Section 3.1, with formula), 'consistency' (Section 4.1, with formula), 'transferability' (IoU, Section 5.1). Prompt templates shown in examples (Figure 1, Appendix C).", 103 "source": "haiku" 104 }, 105 "intended_contribution_clear": { 106 "applies": true, 107 "answer": true, 108 "justification": "Contribution explicitly stated: 'first to compare impact of different prompt formats on GPT models' performance across various tasks.' Three research questions clearly framed (Sensitivity, Consistency, Transferability).", 109 "source": "haiku" 110 }, 111 "engagement_with_prior_work": { 112 "applies": true, 113 "answer": true, 114 "justification": "Section A (Related Work) extensively cites prior prompt engineering research. Clearly distinguishes this work: 'Our research diverges...by examining global prompt format modifications' vs. prior work on fine-grained changes. 
Positions contribution within landscape.", 115 "source": "haiku" 116 } 117 } 118 }, 119 "type_checklist": { 120 "empirical": { 121 "artifacts": { 122 "code_released": { 123 "applies": true, 124 "answer": false, 125 "justification": "No code repository, GitHub link, or code availability statement provided. Experiments depend on proprietary OpenAI API access.", 126 "source": "haiku" 127 }, 128 "data_released": { 129 "applies": true, 130 "answer": true, 131 "justification": "Uses standard public benchmarks (MMLU, HumanEval, CODEXGLUE, NER Finance, HumanEval-X, FIND). Raw data available via original benchmark sources.", 132 "source": "haiku" 133 }, 134 "environment_specified": { 135 "applies": true, 136 "answer": false, 137 "justification": "Only mentions 'Azure OpenAI' without Python version, dependency versions, or Docker spec. No requirements.txt or environment file referenced.", 138 "source": "haiku" 139 }, 140 "reproduction_instructions": { 141 "applies": true, 142 "answer": false, 143 "justification": "No step-by-step reproduction instructions. Would require Azure OpenAI access, which model versions to call, and how to replicate exact API setup. Prompts partially shown (Appendix C) but not comprehensively.", 144 "source": "haiku" 145 } 146 }, 147 "statistical_methodology": { 148 "confidence_intervals_or_error_bars": { 149 "applies": true, 150 "answer": true, 151 "justification": "Tables 3-8 report standard deviations (±) for each condition. Figures 4 and 9 display error bars.", 152 "source": "haiku" 153 }, 154 "significance_tests": { 155 "applies": true, 156 "answer": true, 157 "justification": "Table 1 reports one-sided matched pairs t-tests with p-values; all comparisons yield p<0.001 except HumanEval GPT-4-1106 (p=0.055).", 158 "source": "haiku" 159 }, 160 "effect_sizes_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Absolute performance differences shown (Max/Min in Table 1, e.g., 59.7 vs. 50.0). 
Relative differences noted in text ('40%', '200%', '300%'). Formal effect size metrics (Cohen's d) not reported but magnitude is clear.", 164 "source": "haiku" 165 }, 166 "sample_size_justified": { 167 "applies": true, 168 "answer": false, 169 "justification": "Sample sizes mentioned (MMLU 14,079, HumanEval 164, FIND 500) but not justified. No power analysis or rationale for sufficiency provided.", 170 "source": "haiku" 171 }, 172 "variance_reported": { 173 "applies": true, 174 "answer": true, 175 "justification": "Standard deviations reported in Tables 3-8 and error bars displayed in Figures 4 and 9 across all runs.", 176 "source": "haiku" 177 } 178 }, 179 "evaluation_design": { 180 "baselines_included": { 181 "applies": true, 182 "answer": true, 183 "justification": "Compares 4 format templates (plain text, Markdown, JSON, YAML), with one implicitly serving as baseline. Max/Min comparisons frame within-design baselines.", 184 "source": "haiku" 185 }, 186 "baselines_contemporary": { 187 "applies": true, 188 "answer": true, 189 "justification": "Tests current GPT models (GPT-3.5-turbo and GPT-4 variants from 2023-2024). Baselines are contemporary and appropriate for the research question.", 190 "source": "haiku" 191 }, 192 "ablation_study": { 193 "applies": true, 194 "answer": true, 195 "justification": "Systematic ablation: varies format while keeping prompt content, persona, instructions, examples identical (Appendix C). Format is isolated as sole experimental variable.", 196 "source": "haiku" 197 }, 198 "multiple_metrics": { 199 "applies": true, 200 "answer": true, 201 "justification": "Uses task-specific metrics (accuracy, pass@1, BLEU, NER F1) plus meta-metrics (consistency, IoU). 
Evaluates from three angles (sensitivity, consistency, transferability).", 202 "source": "haiku" 203 }, 204 "human_evaluation": { 205 "applies": false, 206 "answer": false, 207 "justification": "Not applicable—benchmark evaluation only, no subjective quality assessment needed.", 208 "source": "haiku" 209 }, 210 "held_out_test_set": { 211 "applies": true, 212 "answer": true, 213 "justification": "All benchmarks use standard test/dev splits. MMLU uses official dev set for few-shot, test set for evaluation. HumanEval, CODEXGLUE, etc. use established splits.", 214 "source": "haiku" 215 }, 216 "per_category_breakdown": { 217 "applies": true, 218 "answer": true, 219 "justification": "Figure 5 breaks MMLU by domain (STEM, humanities, social science, other) and shows per-domain format sensitivity. Less detailed for other benchmarks.", 220 "source": "haiku" 221 }, 222 "failure_cases_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "Section D.2 explicitly discusses GPT-4-32k's extreme HumanEval failure with JSON (21.95% vs. 76.2% plain text), hypothesizing 'laziness' in chain-of-thought generation.", 226 "source": "haiku" 227 }, 228 "negative_results_reported": { 229 "applies": true, 230 "answer": true, 231 "justification": "Reports no universal optimal format; IoU scores mostly <0.2 for cross-model comparisons, indicating non-transferability. GPT-3.5 consistency <0.5 is noted as negative.", 232 "source": "haiku" 233 } 234 }, 235 "setup_transparency": { 236 "model_versions_specified": { 237 "applies": true, 238 "answer": true, 239 "justification": "Exact model IDs provided: 'gpt-35-turbo-0613', 'gpt-35-turbo-16k-0613', 'gpt-4-32k-0613', 'gpt-4-1106-preview'. Date snapshots implicit in version tags.", 240 "source": "haiku" 241 }, 242 "prompts_provided": { 243 "applies": true, 244 "answer": true, 245 "justification": "Appendix C shows complete prompt templates for NER Finance task with placeholder examples. 
Only one task shown in detail, but templates and structure fully transparent.", 246 "source": "haiku" 247 }, 248 "hyperparameters_reported": { 249 "applies": true, 250 "answer": true, 251 "justification": "Temperature=0 reported for MMLU. For FIND, defers to (Schwettmann et al., 2023) settings without repeating. Top-p, max_tokens not reported.", 252 "source": "haiku" 253 }, 254 "scaffolding_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No agentic scaffolding; straight prompt engineering. Not applicable.", 258 "source": "haiku" 259 }, 260 "data_preprocessing_documented": { 261 "applies": true, 262 "answer": true, 263 "justification": "Describes sampling (NER Finance: random 500/500, FIND: random 500/500) and split usage (MMLU dev for few-shot, test for eval). Limited but adequate for standard benchmarks.", 264 "source": "haiku" 265 } 266 }, 267 "data_integrity": { 268 "raw_data_available": { 269 "applies": true, 270 "answer": true, 271 "justification": "All data sourced from published benchmarks (MMLU, HumanEval, CODEXGLUE, etc.). Raw benchmark data publicly available.", 272 "source": "haiku" 273 }, 274 "data_collection_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "Paper does not collect new data; uses existing benchmarks. NA.", 278 "source": "haiku" 279 }, 280 "recruitment_methods_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human subjects. NA.", 284 "source": "haiku" 285 }, 286 "data_pipeline_documented": { 287 "applies": true, 288 "answer": true, 289 "justification": "Pipeline is simple: ingest public benchmark → vary prompt format → run on API → collect results. Described adequately for standard benchmarks.", 290 "source": "haiku" 291 } 292 }, 293 "contamination": { 294 "training_cutoff_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "No explicit training data cutoff date stated for GPT-3.5 or GPT-4. 
ArXiv date 2024-11-15 but model training dates not discussed.", 298 "source": "haiku" 299 }, 300 "train_test_overlap_discussed": { 301 "applies": true, 302 "answer": false, 303 "justification": "No discussion of whether MMLU, HumanEval, CODEXGLUE, or other benchmarks were present in GPT training data. Potential overlap not addressed.", 304 "source": "haiku" 305 }, 306 "benchmark_contamination_addressed": { 307 "applies": true, 308 "answer": false, 309 "justification": "No analysis of whether benchmarks entered training data before model cutoff. Standard benchmarks may be over-represented in GPT training.", 310 "source": "haiku" 311 } 312 }, 313 "human_studies": { 314 "pre_registered": { 315 "applies": false, 316 "answer": false, 317 "justification": "No human participants. NA.", 318 "source": "haiku" 319 }, 320 "irb_or_ethics_approval": { 321 "applies": false, 322 "answer": false, 323 "justification": "No human participants. NA.", 324 "source": "haiku" 325 }, 326 "demographics_reported": { 327 "applies": false, 328 "answer": false, 329 "justification": "No human participants. NA.", 330 "source": "haiku" 331 }, 332 "inclusion_exclusion_criteria": { 333 "applies": false, 334 "answer": false, 335 "justification": "No human participants. NA.", 336 "source": "haiku" 337 }, 338 "randomization_described": { 339 "applies": false, 340 "answer": false, 341 "justification": "No human participants. NA.", 342 "source": "haiku" 343 }, 344 "blinding_described": { 345 "applies": false, 346 "answer": false, 347 "justification": "No human participants. NA.", 348 "source": "haiku" 349 }, 350 "attrition_reported": { 351 "applies": false, 352 "answer": false, 353 "justification": "No human participants. NA.", 354 "source": "haiku" 355 } 356 }, 357 "cost_and_practicality": { 358 "inference_cost_reported": { 359 "applies": true, 360 "answer": false, 361 "justification": "No API costs, token counts, or latency metrics reported. 
Significant cost incurred but not quantified.", 362 "source": "haiku" 363 }, 364 "compute_budget_stated": { 365 "applies": true, 366 "answer": false, 367 "justification": "Total computational budget not stated. No count of API calls, tokens consumed, or estimated costs provided.", 368 "source": "haiku" 369 } 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "Prompt formatting significantly affects GPT model performance, with variations ranging up to 54pp (300% relative) depending on task and model", 376 "evidence": "Table 1 documents max/min performance across formats for each benchmark; HumanEval GPT-4-32k shows 21.95% (JSON) vs. 76.2% (plain text); text reports '200% improvement' and '300%' for FIND and HumanEval respectively", 377 "supported": "strong" 378 }, 379 { 380 "claim": "Larger models (GPT-4) are more robust to prompt format changes than smaller models (GPT-3.5)", 381 "evidence": "Figure 6 shows Coefficient of Mean Deviation (CMD) consistently lower for GPT-4 (0.035–0.043) than GPT-3.5 (0.035–0.176); Figure 2 shows consistency scores GPT-3.5 <0.5, GPT-4 >0.5", 382 "supported": "strong" 383 }, 384 { 385 "claim": "No universal optimal format exists even within the same GPT model family", 386 "evidence": "Section 5.2 shows IoU <0.2 for cross-model pairs; GPT-3.5 prefers JSON while GPT-4 prefers Markdown (Section 5.2); conclusion: 'no single format excelling universally'", 387 "supported": "strong" 388 }, 389 { 390 "claim": "Model sensitivity to format is task-agnostic, not contingent on task-specific skill requirements", 391 "evidence": "Figure 5 breaks MMLU by domain (STEM, humanities, social science); shows performance spread exists across all domains. 
Section D.1: 'Model's sensitivity...is a general characteristic, rather than being contingent on specific skills'", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "GPT-3.5 outputs show low consistency across formats (<0.5 identical responses), while GPT-4 exceeds 0.5", 396 "evidence": "Figure 2 and Figure 8 explicitly show consistency scores for MMLU and FIND datasets. MMLU: GPT-3.5 'displayed low consistency, with scores below 0.5, and only 16% identical responses between Markdown and JSON'", 397 "supported": "strong" 398 }, 399 { 400 "claim": "Model size correlates with increased robustness to prompt format variation", 401 "evidence": "CMD analysis (Figure 6) shows inverse relationship: GPT-4 lower CMD than GPT-3.5; Section D.2 concludes 'larger models are more robust to template variation'", 402 "supported": "strong" 403 } 404 ], 405 "methodology_tags": [ 406 "benchmark-eval", 407 "empirical" 408 ], 409 "key_findings": "Prompt formatting significantly impacts GPT model performance, with variations exceeding 300% relative improvement on some tasks (e.g., HumanEval). GPT-4 demonstrates substantially greater robustness to format changes (lower Coefficient of Mean Deviation, higher consistency scores) compared to GPT-3.5. No single prompt format universally optimizes performance across different GPT models or even within the same family. Format sensitivity is a general model characteristic independent of task-specific skill requirements or domain expertise.", 410 "red_flags": [ 411 { 412 "flag": "No code release", 413 "detail": "Experiments depend entirely on proprietary OpenAI API access via Azure. Cannot be reproduced without credentials and budget." 414 }, 415 { 416 "flag": "Training data contamination not addressed", 417 "detail": "No analysis of whether MMLU, HumanEval, CODEXGLUE, or other benchmarks were present in GPT training data. Standard benchmarks are likely over-represented in LLM training corpora." 
418 }, 419 { 420 "flag": "No mechanistic explanation", 421 "detail": "Paper documents that format matters but does not explain why. No investigation of tokenization, embedding distribution, transformer attention patterns, or other mechanisms." 422 }, 423 { 424 "flag": "GPT-only scope limits generalizability", 425 "detail": "Results restricted to OpenAI models; does not test open-source LLMs (LLaMA, Phi), proprietary competitors (Gemini, Claude), or smaller models. Acknowledged limitation but weakens contribution." 426 }, 427 { 428 "flag": "Sample size not justified", 429 "detail": "Benchmark samples mentioned (MMLU 14,079, HumanEval 164) but no power analysis or statistical justification for sufficiency provided." 430 }, 431 { 432 "flag": "Hyperparameter transparency incomplete", 433 "detail": "Temperature reported only for MMLU. Top-p, max_tokens, and other sampling parameters not consistently reported across all experiments." 434 }, 435 { 436 "flag": "Limited mechanistic depth", 437 "detail": "Section D.2 speculates about 'laziness' in GPT-4-32k's JSON output on HumanEval but does not systematically investigate. Alternative explanations (tokenization, prompt structure parsing) unexplored." 438 }, 439 { 440 "flag": "No alternative explanation discussion", 441 "detail": "Does not consider why format sensitivity exists (e.g., training data composition, tokenization artifact, transformer architecture preference) or whether findings might be confounded." 442 } 443 ], 444 "cited_papers": [ 445 { 446 "title": "Quantifying language models' sensitivity to spurious features in prompt design", 447 "relevance": "Directly related—prior work showing LLM sensitivity to fine-grained prompt changes (Sclar et al., 2023); this paper extends to global format variations." 
448 }, 449 { 450 "title": "Mind your format: Towards consistent evaluation of in-context learning improvements", 451 "relevance": "Closely related—argues that evaluation standards must marginalize across prompt formats to avoid spurious conclusions (Voronov et al., 2024)." 452 }, 453 { 454 "title": "You don't need a personality test to know these models are unreliable: Assessing the reliability of large language models on psychometric instruments", 455 "relevance": "Related on output consistency and reliability under prompt variation (Shu et al., 2023)." 456 }, 457 { 458 "title": "Evaluating large language models trained on code (HumanEval benchmark)", 459 "relevance": "Benchmark paper defining HumanEval, one of six benchmarks used in this study (Chen et al., 2021)." 460 }, 461 { 462 "title": "Measuring massive multitask language understanding (MMLU)", 463 "relevance": "Benchmark paper defining MMLU, the largest benchmark used in this study (Hendrycks et al., 2020)." 464 }, 465 { 466 "title": "Chain-of-thought prompting elicits reasoning in large language models", 467 "relevance": "Foundational prompting technique cited; this paper tests whether CoT benefits persist across format variations (Wei et al., 2023)." 468 }, 469 { 470 "title": "Language Models are Few-Shot Learners", 471 "relevance": "Foundational in-context learning work; motivates few-shot example ordering as confounding variable this paper controls (Brown et al., 2020)." 472 }, 473 { 474 "title": "Table meets LLM: Can large language models understand structured table data?", 475 "relevance": "Most closely related concurrent work; provides cursory format exploration but limited to tabular data (Sui et al., 2024)." 476 } 477 ], 478 "engagement_factors": { 479 "practical_relevance": { 480 "score": 2, 481 "justification": "Practitioners should care about format choice, but paper provides no guidance on which format to select. Findings are task-model specific without generalizable heuristics." 
482 }, 483 "surprise_contrarian": { 484 "score": 2, 485 "justification": "Moderately surprising that performance can vary by 20–40pp based on formatting alone while semantic content is held constant. Challenges the assumption that models are format-agnostic, but not radical for practitioners aware of prompt brittleness." 486 }, 487 "fear_safety": { 488 "score": 0, 489 "justification": "No AI safety implications. Paper is purely about engineering optimization for task performance, not safety or alignment." 490 }, 491 "drama_conflict": { 492 "score": 1, 493 "justification": "Minimal drama. The finding that evaluations which ignore prompt format may be invalid has some methodological bite, but it is not a major controversy or conflict narrative." 494 }, 495 "demo_ability": { 496 "score": 2, 497 "justification": "Reproducible on Azure OpenAI but requires API credentials, budget, and knowledge of how to call proprietary models. Not accessible to casual users without cost." 498 }, 499 "brand_recognition": { 500 "score": 3, 501 "justification": "Strong brands: Microsoft Research authorship, evaluating OpenAI GPT models. Both highly recognized in AI/ML community."
502 } 503 }, 504 "hn_data": { 505 "threads": [ 506 { 507 "hn_id": "42266742", 508 "title": "The Rise and Fall of Ideas' Popularity [pdf]", 509 "points": 3, 510 "comments": 0, 511 "url": "https://news.ycombinator.com/item?id=42266742", 512 "created_at": "2024-11-28T16:54:44Z" 513 }, 514 { 515 "hn_id": "44854721", 516 "title": "Does Prompt Formatting Have Any Impact on LLM Performance?", 517 "points": 2, 518 "comments": 0, 519 "url": "https://news.ycombinator.com/item?id=44854721", 520 "created_at": "2025-08-10T12:23:36Z" 521 }, 522 { 523 "hn_id": "45930419", 524 "title": "A Large-Scale Computational Analysis of Errors in ArXiv Papers", 525 "points": 1, 526 "comments": 0, 527 "url": "https://news.ycombinator.com/item?id=45930419", 528 "created_at": "2025-11-14T18:52:29Z" 529 }, 530 { 531 "hn_id": "33707451", 532 "title": "Knowledge Graph Generation from Text", 533 "points": 1, 534 "comments": 0, 535 "url": "https://news.ycombinator.com/item?id=33707451", 536 "created_at": "2022-11-22T16:21:57Z" 537 } 538 ], 539 "top_points": 3, 540 "total_points": 7, 541 "total_comments": 0 542 } 543 }