scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24935B)
      1 {
      2   "paper": {
      3     "title": "Does Prompt Formatting Have Any Impact on LLM Performance?",
      4     "authors": ["Jia He", "Mukund Rungta", "David Koleczek", "Arshdeep Sekhon", "Franklin X Wang", "Sadid Hasan"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2411.10541"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "Prompt formatting (plain text, Markdown, JSON, YAML) significantly affects GPT model performance, with up to 40% variation on code translation tasks for GPT-3.5-turbo. Larger models (GPT-4) are more robust to format changes but still show significant sensitivity. No single format is universally optimal, and format preferences do not transfer across model generations (e.g., GPT-3.5 prefers JSON while GPT-4 prefers Markdown on MMLU).",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL, code archive, or mention of code release found anywhere in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The study uses publicly available benchmarks: MMLU, HumanEval, NER Finance (OpenAI Evals), FIND, CODEXGLUE, and HumanEval-X. All are standard public datasets."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specifications, requirements files, or dependency details provided. Only mentions Azure OpenAI as the platform."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions, README, or scripts provided."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Tables 3-8 in Appendix E report results with ± notation (e.g., '59.705 ± 16.594' in Table 3)."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "One-sided matched pairs t-tests are used throughout (Section 3.1), with p-values reported in Table 1. Almost all p < 0.01."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper reports absolute performance differences with baseline context (e.g., Table 1 shows max and min scores for each model/dataset, and Section 3.2 reports percentage improvements like '200% improvement' and '300% boost'). The raw scores provide sufficient context."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification for sample sizes. For NER Finance, '500 examples randomly sampled' with no justification. MMLU uses full test set (14,079) and HumanEval uses all 164 samples, but no power analysis or justification is given."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Standard deviations are reported in Tables 3-8 (e.g., '59.705 ± 16.594' in Table 3 for MMLU). These appear to be across examples rather than across runs, but the ± notation is consistently provided."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The study's design compares four prompt formats against each other, with each serving as a baseline for the others. The comparison is the point of the paper."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The prompt formats compared (plain text, Markdown, JSON, YAML) are all contemporary and widely used formats. Prior work (Sclar et al. 2023, Voronov et al. 2024) is cited as context."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "The study varies a single factor (prompt format) across conditions. There is no multi-component system to ablate."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper uses multiple metrics: accuracy (MMLU), pass@1 (HumanEval), F1-like entity matching (NER Finance), BLEU (CODEXGLUE, HumanEval-X), string indicator metric (FIND), plus consistency (Section 4) and IoU transferability (Section 5) as meta-metrics."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "The study evaluates prompt format sensitivity via automated benchmarks. Human evaluation of model outputs is not relevant to the claims about format sensitivity."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Standard benchmark test sets are used: MMLU dev set for few-shot examples and test set (14,079 questions) for evaluation (Appendix B). HumanEval, CODEXGLUE, etc. all have established test splits."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down per benchmark task (the 7 benchmark tasks in Table 1, drawn from 6 datasets), per model (4 models), and per format (4 formats). Figure 5 provides per-domain MMLU breakdown (humanities, social science, STEM, others)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section D.2 discusses the GPT-4-32k-0613 failure on HumanEval with JSON format (21.95% vs 76.22% plain text), analyzing that the model generated chain-of-thought in plain text but failed to generate code."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports that no single format is universally optimal, which is itself a negative result about format transferability. The GPT-4-32k JSON failure on HumanEval is a notable negative result."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
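    114       "justification": "The 'GPT-4 more robust' claim is supported by Figure 6, which shows lower CMD for GPT-4 models. The 'up to 40% variation in code translation' claim is only partly traceable: the Table 1 row examined here (CODEXGLUE Java2CS: 78.4 vs 66.5 for GPT-35-turbo) is a roughly 18% relative difference, so the 40% figure appears to come from a different result."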
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The paper claims prompt format 'impacts' performance. The controlled experimental design holds content constant while varying only format, which is adequate for this causal claim. Statistical significance is tested via matched pairs t-test."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title asks about 'LLM Performance' generally but the study only tests OpenAI GPT models. The Limitations section (Section 7) acknowledges this but the title and abstract make broad claims about 'LLMs' while testing only one model family."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper does not discuss alternative explanations for why formats affect performance. Section D.2 hypothesizes about the GPT-4-32k JSON failure being related to 'laziness' but does not discuss confounds like tokenization differences across formats, training data format distributions, or whether the observed differences are artifacts of specific API versions."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper directly measures benchmark scores (accuracy, pass@1, BLEU) and claims these show format sensitivity. The claims match the granularity of measurements — no proxy gap exists."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Section 2.3 specifies exact model versions: 'gpt-35-turbo-0613', 'gpt-35-turbo-16k-0613', 'gpt-4-32k-0613', and 'gpt-4-1106-preview'."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Appendix C (Table 2) provides the four prompt templates with their structure. The NER Finance examples in the appendix show actual prompt text including persona, instructions, output format, and placeholders. The fill values for the examples are described as coming from the datasets."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Temperature is mentioned only for MMLU consistency experiment ('set the temperature to zero', Section 4.2). No systematic reporting of temperature, top-p, max tokens, or other sampling parameters for the main experiments across all benchmarks."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. The study sends prompts directly to GPT models via Azure OpenAI API."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Appendix B describes dataset usage: MMLU dev set for few-shot examples and test set for evaluation, NER Finance 500 random samples, HumanEval all 164 samples, FIND strings category with 500 functions and 5 example pairs, CODEXGLUE 1000 test samples. The data selection and preparation is documented."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 7 'Limitations' is a dedicated section discussing three specific limitations."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 7 identifies specific limitations: (1) only GPT models tested, not LLaMA/Gemini/PaLM/Phi; (2) missing HTML/XML formats; (3) no interaction with other prompt engineering techniques like few-shot count variation."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 7 explicitly states what was NOT tested: other model families, other format types (HTML, XML), and interaction with other prompt engineering techniques. Section 2.3 also explains why GPT models specifically were chosen."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw model outputs, per-example predictions, or response logs are released. Only aggregate statistics are provided."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Appendix B describes each dataset's source, size, and evaluation metric. The benchmarks used are all well-documented public datasets."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. All data comes from standard public benchmarks."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The pipeline from benchmark data to formatted prompts to model outputs to scores is not documented step by step. How prompts were constructed programmatically, how API calls were made, and how responses were parsed is not described."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding or acknowledgments section found in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are listed: five authors from Microsoft, one from MIT. Microsoft is clearly identified."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Microsoft (employer of 5/6 authors) is a major investor in OpenAI and resells GPT models via Azure. The paper evaluates GPT models accessed through Azure OpenAI. Microsoft has a financial interest in GPT model adoption, making the funder non-independent."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement found. Microsoft's investment in OpenAI and Azure OpenAI revenue from GPT model access are undisclosed financial interests relevant to a paper evaluating GPT models."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No mention of training data cutoff dates for any of the GPT models used. The paper tests models on benchmarks like HumanEval (2021) and MMLU (2020) without stating when model training data ends."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether the benchmark problems appeared in GPT model training data. HumanEval and MMLU were published years before the models' training cutoffs."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "MMLU (2020), HumanEval (2021), and CODEXGLUE (2021) were all published well before GPT-3.5 and GPT-4 training. No discussion of contamination risk despite this being a study of benchmark performance."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No mention of API costs, token consumption, or inference time despite running 4 models × 4 formats × 6 benchmarks through Azure OpenAI."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No mention of total API spend, compute time, or resources used for the experiments."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No mention of multiple random seeds. Section 4.2 mentions setting temperature to zero for MMLU consistency experiments, but no multi-seed analysis is reported for any benchmark."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The number of experimental runs per condition is not stated. It appears results are from single runs, but this is never made explicit."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": false,
    304         "answer": false,
    305         "justification": "No hyperparameter tuning is involved. The study compares fixed prompt formats on existing benchmarks using API models."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": false,
    309         "answer": false,
    310         "justification": "No configuration selection is involved. All four formats are evaluated and reported."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The paper performs 28 t-tests (7 benchmarks × 4 models) in Table 1 alone, plus additional comparisons. No correction for multiple comparisons (Bonferroni, etc.) is applied."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "The paper does not propose a new system — it compares existing prompt formats on existing models. No self-comparison bias is present."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "All conditions use the same compute (single API call per example). Compute differences across conditions are negligible."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The paper uses 6 benchmarks without discussing whether they adequately capture the construct of 'LLM performance' or whether format sensitivity on these specific benchmarks generalizes to real-world usage."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding is used. Prompts are sent directly to models."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of temporal leakage. MMLU (2020), HumanEval (2021), CODEXGLUE (2021) all predate the models' training cutoffs, meaning models may have seen solutions."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether evaluation setups leak information. For MMLU, few-shot examples from the dev set are provided in-context, but no analysis of whether this creates leakage."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of train/test independence for any benchmark."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection or prevention method applied despite using benchmarks that predate model training."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "GPT-3.5-turbo's performance varies by up to 40% depending on prompt template in code translation tasks",
    364       "evidence": "Table 1 shows CODEXGLUE Java2CS performance ranges from 66.5 (plain text) to 78.4 (JSON) for GPT-35-turbo-0613, roughly an 18% relative difference. The 40% figure appears to reference other tasks.",
    365       "supported": "moderate"
    366     },
    367     {
    368       "claim": "Performance differences due to format changes are statistically significant across nearly all model-benchmark combinations",
    369       "evidence": "Table 1 shows p-values < 0.01 for 27 of 28 model-benchmark combinations (exception: GPT-4-1106-preview on HumanEval, p=0.055).",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Larger models (GPT-4) are more consistent across prompt formats than smaller models (GPT-3.5)",
    374       "evidence": "Figure 2 shows GPT-4 consistency scores above 0.5 vs GPT-3.5 below 0.5 on MMLU. Figure 6 shows lower CMD values for GPT-4 series across all benchmarks.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "There is no universally optimal prompt format — preferences do not transfer across GPT model generations",
    379       "evidence": "Figure 3 and Section 5.2 show IoU below 0.2 between GPT-3.5 and GPT-4 series for top templates. GPT-3.5 prefers JSON on MMLU while GPT-4 prefers Markdown (Table 1).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "GPT-4-turbo demonstrates greater resilience to prompt format changes compared to predecessors",
    384       "evidence": "Figure 6 shows GPT-4-1106-preview has CMD consistently below 0.036 across all benchmarks, lower than all other models.",
    385       "supported": "strong"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "Conflict of interest not disclosed",
    391       "detail": "Five of six authors are Microsoft employees. Microsoft is a major investor in OpenAI and resells GPT models through Azure. The paper evaluates GPT models via Azure OpenAI without disclosing this financial relationship."
    392     },
    393     {
    394       "flag": "No contamination discussion",
    395       "detail": "The named benchmarks (MMLU 2020, HumanEval 2021, CODEXGLUE 2021) were published before the GPT models' training cutoffs. If models memorized benchmark solutions, format-dependent performance differences could reflect format-dependent recall of memorized answers rather than genuine reasoning sensitivity."
    396     },
    397     {
    398       "flag": "Multiple comparisons without correction",
    399       "detail": "28+ t-tests reported in Table 1 with no family-wise error rate correction. Most p-values are very small (< 0.001) and would likely survive correction; the one borderline case (GPT-4-1106-preview on HumanEval, p=0.055) is already non-significant at the 0.05 level before any correction is applied."
    400     },
    401     {
    402       "flag": "Single model family tested with broad claims",
    403       "detail": "The title asks about 'LLM Performance' broadly but only OpenAI GPT models are tested. Claims about format sensitivity may not generalize to other architectures or training approaches."
    404     },
    405     {
    406       "flag": "Variance may not represent run-to-run variability",
    407       "detail": "The ± values in Tables 3-8 appear to be standard deviation across test examples, not across multiple runs. It is unclear whether experiments were run multiple times. With temperature=0 stated only for MMLU consistency, the stochasticity of other experiments is unknown."
    408     }
    409   ],
    410   "cited_papers": [
    411     {
    412       "title": "Quantifying language models' sensitivity to spurious features in prompt design",
    413       "authors": ["Melanie Sclar", "Yejin Choi", "Yulia Tsvetkov", "Alane Suhr"],
    414       "year": 2023,
    415       "arxiv_id": "2310.11324",
    416       "relevance": "Directly related study on LLM sensitivity to fine-grained prompt modifications like separators and capitalization."
    417     },
    418     {
    419       "title": "Mind your format: Towards consistent evaluation of in-context learning improvements",
    420       "authors": ["Anton Voronov", "Lena Wolf", "Max Ryabinin"],
    421       "year": 2024,
    422       "arxiv_id": "2401.06766",
    423       "relevance": "Shows that fixed templates in evaluation may lead to misleading conclusions about LLM capabilities."
    424     },
    425     {
    426       "title": "Evaluating large language models trained on code",
    427       "authors": ["Mark Chen"],
    428       "year": 2021,
    429       "relevance": "HumanEval benchmark used in this study, foundational for LLM code generation evaluation."
    430     },
    431     {
    432       "title": "Measuring massive multitask language understanding",
    433       "authors": ["Dan Hendrycks"],
    434       "year": 2020,
    435       "arxiv_id": "2009.03300",
    436       "relevance": "MMLU benchmark used in this study, widely used for LLM evaluation."
    437     },
    438     {
    439       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    440       "authors": ["Jason Wei"],
    441       "year": 2023,
    442       "arxiv_id": "2201.11903",
    443       "relevance": "Foundational prompting technique that interacts with prompt formatting choices."
    444     },
    445     {
    446       "title": "A systematic survey of prompt engineering in large language models: Techniques and applications",
    447       "authors": ["Pranab Sahoo"],
    448       "year": 2024,
    449       "arxiv_id": "2402.07927",
    450       "relevance": "Comprehensive survey of prompt engineering techniques in LLMs."
    451     },
    452     {
    453       "title": "Table meets LLM: Can large language models understand structured table data?",
    454       "authors": ["Yuan Sui"],
    455       "year": 2024,
    456       "relevance": "Closest related work examining format influence on LLM understanding, focused on tabular data."
    457     },
    458     {
    459       "title": "CodeGeeX: A pre-trained model for code generation with multilingual evaluations on HumanEval-X",
    460       "authors": ["Qinkai Zheng"],
    461       "year": 2023,
    462       "arxiv_id": "2303.17568",
    463       "relevance": "HumanEval-X benchmark used for multilingual code translation evaluation."
    464     },
    465     {
    466       "title": "Calibrate before use: Improving few-shot performance of language models",
    467       "authors": ["Tony Z. Zhao"],
    468       "year": 2021,
    469       "arxiv_id": "2102.09690",
    470       "relevance": "Addresses LLM sensitivity to prompt construction, related to calibration of few-shot predictions."
    471     }
    472   ]
    473 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs