scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (31518B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Does Prompt Formatting Have Any Impact on LLM Performance?",
      6     "authors": [
      7       "Jia He",
      8       "Mukund Rungta",
      9       "David Koleczek",
     10       "Arshdeep Sekhon",
     11       "Franklin X Wang",
     12       "Sadid Hasan"
     13     ],
     14     "year": 2024,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2411.10541",
     17     "doi": null
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Abstract claims of 'up to 40% variation in code translation' and 'GPT-4 more robust' are supported by Table 1 (CODEXGLUE Java2CS: 78.4 vs 66.5 for GPT-35-turbo) and Figure 6 showing lower CMD for GPT-4 models.",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper claims prompt format 'impacts' performance. The controlled experimental design holds content constant while varying only format, which is adequate for this causal claim. Statistical significance is tested via matched pairs t-test.",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The title asks about 'LLM Performance' generally but the study only tests OpenAI GPT models. The Limitations section (Section 7) acknowledges this but the title and abstract make broad claims about 'LLMs' while testing only one model family.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper does not discuss alternative explanations for why formats affect performance. Section D.2 hypothesizes about the GPT-4-32k JSON failure being related to 'laziness' but does not discuss confounds like tokenization differences across formats, training data format distributions, or whether the observed differences are artifacts of specific API versions.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper directly measures benchmark scores (accuracy, pass@1, BLEU) and claims these show format sensitivity. The claims match the granularity of measurements — no proxy gap exists.",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 7 'Limitations' is a dedicated section discussing three specific limitations.",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Section 7 identifies specific limitations: (1) only GPT models tested, not LLaMA/Gemini/PaLM/Phi; (2) missing HTML/XML formats; (3) no interaction with other prompt engineering techniques like few-shot count variation.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Section 7 explicitly states what was NOT tested: other model families, other format types (HTML, XML), and interaction with other prompt engineering techniques. Section 2.3 also explains why GPT models specifically were chosen.",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding or acknowledgments section found in the paper.",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations are listed: five authors from Microsoft, one from MIT. Microsoft is clearly identified.",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Microsoft (employer of 5/6 authors) is a major investor in OpenAI and resells GPT models via Azure. The paper evaluates GPT models accessed through Azure OpenAI. Microsoft has a financial interest in GPT model adoption, making the funder non-independent.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement found. Microsoft's investment in OpenAI and Azure OpenAI revenue from GPT model access are undisclosed financial interests relevant to a paper evaluating GPT models.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Key terms are formally defined: 'Sensitivity' (Section 3.1, with mathematical definition), 'Consistency' (Section 4.1, with formula), 'Transferability' via IoU (Section 5.1). Prompt format is defined as structural variations (plain text, Markdown, JSON, YAML) with identical content.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Contributions are explicitly stated: (1) first to compare prompt format impact across diverse GPT models, (2) extensive analysis across NL2NL, NL2Code, Code2Code tasks, (3) evaluation showing GPT-4-turbo is less susceptible to format variations. Three research questions (Sensitivity, Consistency, Transferability) frame the work.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section A 'Related Work' surveys prompt engineering (few-shot, CoT, RAG, ReAct) and prompt format sensitivity literature. Paper explicitly contrasts its approach: 'Our study diverges...by examining global prompt format modifications' versus prior work on 'subtle, local changes' like colons or newlines.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "No repository URL, code archive, or mention of code release found anywhere in the paper.",
    126           "source": "opus"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "The study uses publicly available benchmarks: MMLU, HumanEval, NER Finance (OpenAI Evals), FIND, CODEXGLUE, and HumanEval-X. All are standard public datasets.",
    132           "source": "opus"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "No environment specifications, requirements files, or dependency details provided. Only mentions Azure OpenAI as the platform.",
    138           "source": "opus"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No step-by-step reproduction instructions, README, or scripts provided.",
    144           "source": "opus"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": true,
    151           "justification": "Tables 3-8 in Appendix E report results with ± notation (e.g., '59.705 ± 16.594' in Table 3).",
    152           "source": "opus"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "One-sided matched pairs t-tests are used throughout (Section 3.1), with p-values reported in Table 1. Almost all p < 0.01.",
    158           "source": "opus"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "The paper reports absolute performance differences with baseline context (e.g., Table 1 shows max and min scores for each model/dataset, and Section 3.2 reports percentage improvements like '200% improvement' and '300% boost'). The raw scores provide sufficient context.",
    164           "source": "opus"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "No justification for sample sizes. For NER Finance, '500 examples randomly sampled' with no justification. MMLU uses full test set (14,079) and HumanEval uses all 164 samples, but no power analysis or justification is given.",
    170           "source": "opus"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": true,
    175           "justification": "Standard deviations are reported in Tables 3-8 (e.g., '59.705 ± 16.594' in Table 3 for MMLU). These appear to be across examples rather than across runs, but the ± notation is consistently provided.",
    176           "source": "opus"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "The study's design compares four prompt formats against each other, with each serving as a baseline for the others. The comparison is the point of the paper.",
    184           "source": "opus"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "The prompt formats compared (plain text, Markdown, JSON, YAML) are all contemporary and widely used formats. Prior work (Sclar et al. 2023, Voronov et al. 2024) is cited as context.",
    190           "source": "opus"
    191         },
    192         "ablation_study": {
    193           "applies": false,
    194           "answer": false,
    195           "justification": "The study varies a single factor (prompt format) across conditions. There is no multi-component system to ablate.",
    196           "source": "opus"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "The paper uses multiple metrics: accuracy (MMLU), pass@1 (HumanEval), F1-like entity matching (NER Finance), BLEU (CODEXGLUE, HumanEval-X), string indicator metric (FIND), plus consistency (Section 4) and IoU transferability (Section 5) as meta-metrics.",
    202           "source": "opus"
    203         },
    204         "human_evaluation": {
    205           "applies": false,
    206           "answer": false,
    207           "justification": "The study evaluates prompt format sensitivity via automated benchmarks. Human evaluation of model outputs is not relevant to the claims about format sensitivity.",
    208           "source": "opus"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "Standard benchmark test sets are used: MMLU dev set for few-shot examples and test set (14,079 questions) for evaluation (Appendix B). HumanEval, CODEXGLUE, etc. all have established test splits.",
    214           "source": "opus"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Results are broken down per dataset (7 benchmarks), per model (4 models), and per format (4 formats). Figure 5 provides per-domain MMLU breakdown (humanities, social science, STEM, others).",
    220           "source": "opus"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Section D.2 discusses the GPT-4-32k-0613 failure on HumanEval with JSON format (21.95% vs 76.22% plain text), analyzing that the model generated chain-of-thought in plain text but failed to generate code.",
    226           "source": "opus"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "The paper reports that no single format is universally optimal, which is itself a negative result about format transferability. The GPT-4-32k JSON failure on HumanEval is a notable negative result.",
    232           "source": "opus"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "Section 2.3 specifies exact model versions: 'gpt-35-turbo-0613', 'gpt-35-turbo-16k-0613', 'gpt-4-32k-0613', and 'gpt-4-1106-preview'.",
    240           "source": "opus"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Appendix C (Table 2) provides the four prompt templates with their structure. The NER Finance examples in the appendix show actual prompt text including persona, instructions, output format, and placeholders. The fill values for the examples are described as coming from the datasets.",
    246           "source": "opus"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": false,
    251           "justification": "Temperature is mentioned only for MMLU consistency experiment ('set the temperature to zero', Section 4.2). No systematic reporting of temperature, top-p, max tokens, or other sampling parameters for the main experiments across all benchmarks.",
    252           "source": "opus"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "No agentic scaffolding is used. The study sends prompts directly to GPT models via Azure OpenAI API.",
    258           "source": "opus"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Appendix B describes dataset usage: MMLU dev set for few-shot examples and test set for evaluation, NER Finance 500 random samples, HumanEval all 164 samples, FIND strings category with 500 functions and 5 example pairs, CODEXGLUE 1000 test samples. The data selection and preparation is documented.",
    264           "source": "opus"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "No raw model outputs, per-example predictions, or response logs are released. Only aggregate statistics are provided.",
    272           "source": "opus"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Appendix B describes each dataset's source, size, and evaluation metric. The benchmarks used are all well-documented public datasets.",
    278           "source": "opus"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "No human participants. All data comes from standard public benchmarks.",
    284           "source": "opus"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": false,
    289           "justification": "The pipeline from benchmark data to formatted prompts to model outputs to scores is not documented step by step. How prompts were constructed programmatically, how API calls were made, and how responses were parsed is not described.",
    290           "source": "opus"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "No mention of training data cutoff dates for any of the GPT models used. The paper tests models on benchmarks like HumanEval (2021) and MMLU (2020) without stating when model training data ends.",
    298           "source": "opus"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "No discussion of whether the benchmark problems appeared in GPT model training data. HumanEval and MMLU were published years before the models' training cutoffs.",
    304           "source": "opus"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "MMLU (2020), HumanEval (2021), and CODEXGLUE (2021) were all published well before GPT-3.5 and GPT-4 training. No discussion of contamination risk despite this being a study of benchmark performance.",
    310           "source": "opus"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No human participants in this study.",
    318           "source": "opus"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants in this study.",
    324           "source": "opus"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants in this study.",
    330           "source": "opus"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants in this study.",
    336           "source": "opus"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants in this study.",
    342           "source": "opus"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants in this study.",
    348           "source": "opus"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants in this study.",
    354           "source": "opus"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "No mention of API costs, token consumption, or inference time despite running 4 models × 4 formats × 6 benchmarks through Azure OpenAI.",
    362           "source": "opus"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "No mention of total API spend, compute time, or resources used for the experiments.",
    368           "source": "opus"
    369         }
    370       },
    371       "experimental_rigor": {
    372         "seed_sensitivity_reported": {
    373           "applies": true,
    374           "answer": false,
    375           "justification": "No mention of multiple random seeds. Section 4.2 mentions setting temperature to zero for MMLU consistency experiments, but no multi-seed analysis is reported for any benchmark.",
    376           "source": "opus"
    377         },
    378         "number_of_runs_stated": {
    379           "applies": true,
    380           "answer": false,
    381           "justification": "The number of experimental runs per condition is not stated. It appears results are from single runs, but this is never made explicit.",
    382           "source": "opus"
    383         },
    384         "hyperparameter_search_budget": {
    385           "applies": false,
    386           "answer": false,
    387           "justification": "No hyperparameter tuning is involved. The study compares fixed prompt formats on existing benchmarks using API models.",
    388           "source": "opus"
    389         },
    390         "best_config_selection_justified": {
    391           "applies": false,
    392           "answer": false,
    393           "justification": "No configuration selection is involved. All four formats are evaluated and reported.",
    394           "source": "opus"
    395         },
    396         "multiple_comparison_correction": {
    397           "applies": true,
    398           "answer": false,
    399           "justification": "The paper performs 28 t-tests (7 benchmarks × 4 models) in Table 1 alone, plus additional comparisons. No correction for multiple comparisons (Bonferroni, etc.) is applied.",
    400           "source": "opus"
    401         },
    402         "self_comparison_bias_addressed": {
    403           "applies": false,
    404           "answer": false,
    405           "justification": "The paper does not propose a new system — it compares existing prompt formats on existing models. No self-comparison bias is present.",
    406           "source": "opus"
    407         },
    408         "compute_budget_vs_performance": {
    409           "applies": false,
    410           "answer": false,
    411           "justification": "All conditions use the same compute (single API call per example). Compute differences across conditions are negligible.",
    412           "source": "opus"
    413         },
    414         "benchmark_construct_validity": {
    415           "applies": true,
    416           "answer": false,
    417           "justification": "The paper uses 6 benchmarks without discussing whether they adequately capture the construct of 'LLM performance' or whether format sensitivity on these specific benchmarks generalizes to real-world usage.",
    418           "source": "opus"
    419         },
    420         "scaffold_confound_addressed": {
    421           "applies": false,
    422           "answer": false,
    423           "justification": "No scaffolding is used. Prompts are sent directly to models.",
    424           "source": "opus"
    425         }
    426       },
    427       "data_leakage": {
    428         "temporal_leakage_addressed": {
    429           "applies": true,
    430           "answer": false,
    431           "justification": "No discussion of temporal leakage. MMLU (2020), HumanEval (2021), CODEXGLUE (2021) all predate the models' training cutoffs, meaning models may have seen solutions.",
    432           "source": "opus"
    433         },
    434         "feature_leakage_addressed": {
    435           "applies": true,
    436           "answer": false,
    437           "justification": "No discussion of whether evaluation setups leak information. For MMLU, few-shot examples from the dev set are provided in-context, but no analysis of whether this creates leakage.",
    438           "source": "opus"
    439         },
    440         "non_independence_addressed": {
    441           "applies": true,
    442           "answer": false,
    443           "justification": "No discussion of train/test independence for any benchmark.",
    444           "source": "opus"
    445         },
    446         "leakage_detection_method": {
    447           "applies": true,
    448           "answer": false,
    449           "justification": "No leakage detection or prevention method applied despite using benchmarks that predate model training.",
    450           "source": "opus"
    451         }
    452       }
    453     }
    454   },
    455   "claims": [
    456     {
    457       "claim": "Prompt formatting significantly impacts GPT model performance (up to 40% variation for GPT-3.5)",
    458       "evidence": "Table 1 matched pairs t-tests show p < 0.001 for most comparisons; GPT-3.5 shows up to 40% max-min variation on code translation (e.g., CODEXGLUE Java2CS: 78.4 vs 66.5 for GPT-35-turbo); FIND shows 200-300% improvement across formats.",
    459       "supported": "strong"
    460     },
    461     {
    462       "claim": "Larger models like GPT-4 are more robust to prompt format variations than GPT-3.5",
    463       "evidence": "Consistency scores (Figure 2): GPT-3.5 < 0.5, GPT-4 > 0.5 on MMLU. Coefficient of Mean Deviation (Figure 6): GPT-4-1106-preview CMD ≤ 0.036 vs GPT-3.5 range 0.035-0.176.",
    464       "supported": "strong"
    465     },
    466     {
    467       "claim": "No universal optimal format exists across different GPT models or tasks",
    468       "evidence": "GPT-3.5 prefers JSON (Table 1: MMLU 59.7% JSON vs 50.0% Markdown), GPT-4 prefers Markdown (81.2% vs 73.9% JSON). IoU scores for cross-model format overlap are low (< 0.2 for different series).",
    469       "supported": "strong"
    470     },
    471     {
    472       "claim": "Format preferences differ between model generations within the same family",
    473       "evidence": "GPT-3.5-turbo models show high IoU (> 0.7) with each other, but low IoU (< 0.2) vs GPT-4 models. Figure 5 shows GPT-3.5 prefers JSON while GPT-4 prefers Markdown.",
    474       "supported": "strong"
    475     },
    476     {
    477       "claim": "Model size/capability correlates with robustness to format variation",
    478       "evidence": "Larger GPT-4 models exhibit lower CMD (performance dispersion) across benchmarks compared to GPT-3.5, indicating better consistency across format changes.",
    479       "supported": "moderate"
    480     },
    481     {
    482       "claim": "Format sensitivity is a general characteristic across diverse tasks, not task-specific",
    483       "evidence": "Figure 5 analysis of MMLU by domain (STEM, Humanities, Social Sciences, Other) shows performance spread exists across all domains; paper states 'sensitivity is a general characteristic, rather than contingent on specific skills.'",
    484       "supported": "moderate"
    485     }
    486   ],
    487   "methodology_tags": [
    488     "benchmark-eval"
    489   ],
    490   "key_findings": "Prompt formatting significantly impacts GPT model performance, with GPT-3.5 showing up to 40% performance variation across four structural formats (plain text, Markdown, JSON, YAML), while larger models like GPT-4 demonstrate greater robustness to these variations. No single format consistently outperforms others across models and tasks: GPT-3.5 favors JSON while GPT-4 favors Markdown. The findings challenge current evaluation practices that use fixed prompt templates and suggest practitioners should test multiple formats to accurately assess model capabilities.",
    491   "red_flags": [
    492     {
    493       "flag": "No code release",
    494       "detail": "Evaluation harness and prompting scripts are not released, limiting full reproducibility despite detailed descriptions. Makes it difficult for others to extend this work or verify exact implementation."
    495     },
    496     {
    497       "flag": "Training data contamination not addressed",
    498       "detail": "Critical gap: the paper does not discuss whether MMLU, HumanEval, CODEXGLUE, and the other benchmarks appear in the training data of GPT-3.5/GPT-4. These widely used public benchmarks are likely in the training sets, so the observed format sensitivity could partly reflect memorization artifacts rather than genuine architectural differences."
    499     },
    500     {
    501       "flag": "Limited model diversity",
    502       "detail": "Only OpenAI GPT models tested. Title claims about 'LLM' performance are overstated. Findings may not generalize to other architectures (LLaMA, Gemini, etc.) as authors acknowledge in limitations."
    503     },
    504     {
    505       "flag": "No inference cost analysis",
    506       "detail": "Paper does not discuss computational or financial costs despite evaluating multiple models across 6 benchmarks with 4 format variants. Practitioners need cost-benefit analysis for prompt format optimization."
    507     },
    508     {
    509       "flag": "Incomplete hyperparameter documentation",
    510       "detail": "Only temperature mentioned (set to 0 for MMLU); missing top-p, frequency_penalty, max_tokens, and other OpenAI API parameters. Makes exact reproduction difficult."
    511     },
    512     {
    513       "flag": "Consistency metric artifacts",
    514       "detail": "Temperature=0 for consistency measurement removes realistic sampling variability. Consistency measured at T=0 may not reflect production use cases where temperature > 0."
    515     }
    516   ],
    517   "cited_papers": [
    518     {
    519       "title": "Quantifying language models' sensitivity to spurious features in prompt design",
    520       "authors": "Sclar et al.",
    521       "year": 2023,
    522       "relevance": "Directly related prior work on LLM sensitivity to prompt format details (local changes); this paper extends to global format variations."
    523     },
    524     {
    525       "title": "Mind your format: Towards consistent evaluation of in-context learning improvements",
    526       "authors": "Voronov et al.",
    527       "year": 2024,
    528       "relevance": "Related work on prompt format consistency and evaluation standards; motivates need for standardized format testing."
    529     },
    530     {
    531       "title": "You don't need a personality test to know these models are unreliable",
    532       "authors": "Shu et al.",
    533       "year": 2023,
    534       "relevance": "Demonstrates model reliability issues across different prompt conditions; supports need for robustness evaluation."
    535     },
    536     {
    537       "title": "Lost in the middle: How language models use long contexts",
    538       "authors": "Liu et al.",
    539       "year": 2023,
    540       "relevance": "Shows position effects in prompts; related to structural/format sensitivity phenomena."
    541     },
    542     {
    543       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    544       "authors": "Wei et al.",
    545       "year": 2023,
    546       "relevance": "Foundational prompting technique work; establishes that prompt structure affects reasoning outputs."
    547     },
    548     {
    549       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    550       "authors": "Lewis et al.",
    551       "year": 2021,
    552       "relevance": "Prompt-based architecture; relates to prompt engineering as critical lever for model control."
    553     },
    554     {
    555       "title": "ReAct: Synergizing reasoning and acting in large language models",
    556       "authors": "Yao et al.",
    557       "year": 2023,
    558       "relevance": "Demonstrates importance of prompt structure for agent behavior; format choices affect agentic capabilities."
    559     },
    560     {
    561       "title": "GPT-4 Technical Report",
    562       "authors": "Achiam et al.",
    563       "year": 2023,
    564       "relevance": "Baseline model documentation; provides architectural context for understanding format robustness differences between models."
    565     }
    566   ],
    567   "engagement_factors": {
    568     "practical_relevance": {
    569       "score": 2,
    570       "justification": "Practitioners using OpenAI GPT models need this guidance—test multiple formats, expect 40% sensitivity in smaller models, expect stability in GPT-4. Limited by vendor lock-in (GPT-only)."
    571     },
    572     "surprise_contrarian": {
    573       "score": 2,
    574       "justification": "Contradicts common assumption that 'format doesn't matter much' and that 'models understand content semantically.' The finding challenges fixed-template evaluation standards."
    575     },
    576     "fear_safety": {
    577       "score": 0,
    578       "justification": "No safety or alignment concerns raised in the paper. Format sensitivity is a robustness/reliability issue but not a safety threat."
    579     },
    580     "drama_conflict": {
    581       "score": 0,
    582       "justification": "Straightforward empirical finding with no controversial claims or competing interpretations. No drama angle."
    583     },
    584     "demo_ability": {
    585       "score": 3,
    586       "justification": "Highly demoable: anyone with OpenAI API can immediately test different prompt formats on their own tasks. Results are reproducible within seconds via API calls."
    587     },
    588     "brand_recognition": {
    589       "score": 2,
    590       "justification": "Microsoft Research and MIT co-authors studying OpenAI models. Solid institutional backing but not A-list labs (OpenAI, Google DeepMind). arXiv preprint (not peer-reviewed venue)."
    591     }
    592   },
    593   "hn_data": {
    594     "threads": [
    595       {
    596         "hn_id": "42266742",
    597         "title": "The Rise and Fall of Ideas' Popularity [pdf]",
    598         "points": 3,
    599         "comments": 0,
    600         "url": "https://news.ycombinator.com/item?id=42266742",
    601         "created_at": "2024-11-28T16:54:44Z"
    602       },
    603       {
    604         "hn_id": "44854721",
    605         "title": "Does Prompt Formatting Have Any Impact on LLM Performance?",
    606         "points": 2,
    607         "comments": 0,
    608         "url": "https://news.ycombinator.com/item?id=44854721",
    609         "created_at": "2025-08-10T12:23:36Z"
    610       },
    611       {
    612         "hn_id": "45930419",
    613         "title": "A Large-Scale Computational Analysis of Errors in ArXiv Papers",
    614         "points": 1,
    615         "comments": 0,
    616         "url": "https://news.ycombinator.com/item?id=45930419",
    617         "created_at": "2025-11-14T18:52:29Z"
    618       },
    619       {
    620         "hn_id": "33707451",
    621         "title": "Knowledge Graph Generation from Text",
    622         "points": 1,
    623         "comments": 0,
    624         "url": "https://news.ycombinator.com/item?id=33707451",
    625         "created_at": "2022-11-22T16:21:57Z"
    626       }
    627     ],
    628     "top_points": 3,
    629     "total_points": 7,
    630     "total_comments": 0
    631   }
    632 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs