scan-v5.json (27095B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Do Prompts Reshape Representations? An Empirical Study of Prompting Effects on Embeddings", 6 "authors": [ 7 "Cesar Gonzalez-Gutierrez", 8 "Dirk Hovy" 9 ], 10 "year": 2025, 11 "venue": "arXiv.org", 12 "arxiv_id": "2510.19694", 13 "doi": "10.48550/arXiv.2510.19694" 14 }, 15 "checklist": { 16 "claims_and_evidence": { 17 "abstract_claims_supported": { 18 "applies": true, 19 "answer": true, 20 "justification": "The abstract claims prompting affects representations but changes don't consistently correlate with prompt relevance — both are directly supported by the probing experiments in Section 3 across multiple models and datasets.", 21 "source": "haiku" 22 }, 23 "causal_claims_justified": { 24 "applies": true, 25 "answer": true, 26 "justification": "The controlled design (same samples, varied prompts, same model) is adequate for the narrow causal claim that prompting modifies representations; the static prompt ablation (Table 4) further isolates the mechanism.", 27 "source": "haiku" 28 }, 29 "generalization_bounded": { 30 "applies": true, 31 "answer": true, 32 "justification": "The Limitations section explicitly states findings 'may not generalize to larger, instruction-tuned models' and that 'generalizability to other tasks... 
remains an open question.'", 33 "source": "haiku" 34 }, 35 "alternative_explanations_discussed": { 36 "applies": true, 37 "answer": true, 38 "justification": "Section 5 discusses three alternative explanations for the unexpected behavior: embedding-level perspective may be too limited, models may be insufficiently pre-trained, and instruction fine-tuning may be necessary.", 39 "source": "haiku" 40 }, 41 "proxy_outcome_distinction": { 42 "applies": true, 43 "answer": true, 44 "justification": "The paper is explicit that MaxEnt probe performance is used as a proxy for 'representation quality' and introduces task alignment as a complementary metric; the distinction between probe performance and actual task performance is acknowledged throughout.", 45 "source": "haiku" 46 } 47 }, 48 "limitations_and_scope": { 49 "limitations_section_present": { 50 "applies": true, 51 "answer": true, 52 "justification": "There is a dedicated 'Limitations' section covering the static embedding perspective, small pre-training corpora relative to modern LLMs, and restricted task/dataset scope.", 53 "source": "haiku" 54 }, 55 "threats_to_validity_specific": { 56 "applies": true, 57 "answer": true, 58 "justification": "Specific threats are named: models pre-trained on 'relatively small corpora compared to those used for modern large-scale models,' and results confined to 'a limited set of classification tasks and datasets such as toxicity detection, sentiment analysis, and topic classification.'", 59 "source": "haiku" 60 }, 61 "scope_boundaries_stated": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper explicitly states it does not explain why the behavior occurs, and that findings may not extend to larger instruction-tuned models or tasks with more complex output spaces.", 65 "source": "haiku" 66 } 67 }, 68 "conflicts_of_interest": { 69 "funding_disclosed": { 70 "applies": true, 71 "answer": true, 72 "justification": "Funding is disclosed in Acknowledgments: ERC 
Horizon 2020 grant No 853459, EU ERDF/Comunitat Valenciana compute resources, and AGAUR recognition 2021SGR-Cat (01266 LQMC).", 73 "source": "haiku" 74 }, 75 "affiliations_disclosed": { 76 "applies": true, 77 "answer": true, 78 "justification": "Author affiliations are clearly listed on the first page: Polytechnic University of Catalonia (Gonzalez-Gutierrez) and Bocconi University (Hovy).", 79 "source": "haiku" 80 }, 81 "funder_independent_of_outcome": { 82 "applies": true, 83 "answer": true, 84 "justification": "ERC and EU ERDF are independent public research funders with no commercial stake in whether prompt relevance improves or fails to improve representations.", 85 "source": "haiku" 86 }, 87 "financial_interests_declared": { 88 "applies": true, 89 "answer": false, 90 "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is present anywhere in the paper.", 91 "source": "haiku" 92 } 93 }, 94 "scope_and_framing": { 95 "key_terms_defined": { 96 "applies": true, 97 "answer": true, 98 "justification": "ICL, probing, zero-shot prompting, prompt templates, and 'representation quality' (operationalized as probe classifier performance) are all defined in Sections 1-2.", 99 "source": "haiku" 100 }, 101 "intended_contribution_clear": { 102 "applies": true, 103 "answer": true, 104 "justification": "Three explicit contributions are listed at the end of Section 1: empirical comparison of representation quality across prompt types, demonstration that prompting contextualizes representations, and the finding that prompt relevance does not predict representation quality changes.", 105 "source": "haiku" 106 }, 107 "engagement_with_prior_work": { 108 "applies": true, 109 "answer": true, 110 "justification": "Section 4 explicitly contrasts with Park et al. 2025 (LMs producing new in-context representations vs. improving existing ones) and Kirsanov et al. 2025 (class separability in large models on synthetic data vs. 
probing on natural benchmarks).", 111 "source": "haiku" 112 } 113 } 114 }, 115 "type_checklist": { 116 "empirical": { 117 "artifacts": { 118 "code_released": { 119 "applies": true, 120 "answer": false, 121 "justification": "No code repository URL or release is mentioned anywhere in the paper.", 122 "source": "haiku" 123 }, 124 "data_released": { 125 "applies": true, 126 "answer": true, 127 "justification": "All datasets used (IMDB, AG News, Wiki Toxic, RTE, Adversarial NLI, etc.) are standard publicly available benchmarks sourced from HuggingFace Datasets as noted in Table 5.", 128 "source": "haiku" 129 }, 130 "environment_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "No requirements.txt, Dockerfile, or specific software environment is provided; only model papers are cited without specifying versions or package dependencies.", 134 "source": "haiku" 135 }, 136 "reproduction_instructions": { 137 "applies": true, 138 "answer": false, 139 "justification": "No step-by-step reproduction instructions are provided; the paper describes methodology in general terms but not how to replicate the experiments from scratch.", 140 "source": "haiku" 141 } 142 }, 143 "statistical_methodology": { 144 "confidence_intervals_or_error_bars": { 145 "applies": true, 146 "answer": false, 147 "justification": "Statistical significance (p-values via bootstrap) is reported but confidence intervals or error bars are not shown on the primary probing results in Figure 1 or Table 6.", 148 "source": "haiku" 149 }, 150 "significance_tests": { 151 "applies": true, 152 "answer": true, 153 "justification": "Bootstrap sampling statistics (Berg-Kirkpatrick et al., 2012) via the boostsa library are used to compute p-values for probe performance differences, reported at p<0.05 and p<0.01 levels.", 154 "source": "haiku" 155 }, 156 "effect_sizes_reported": { 157 "applies": true, 158 "answer": false, 159 "justification": "Raw performance numbers are reported but no standardized 
effect sizes (Cohen's d, eta-squared) are calculated; absolute differences are typically sub-1% making practical significance unclear.", 160 "source": "haiku" 161 }, 162 "sample_size_justified": { 163 "applies": true, 164 "answer": false, 165 "justification": "Dataset sizes are determined by the benchmarks used; no power analysis or justification is given for why these particular datasets or the number of prompt templates (5 per task) were chosen.", 166 "source": "haiku" 167 }, 168 "variance_reported": { 169 "applies": true, 170 "answer": false, 171 "justification": "Standard deviations are reported in Table 2 for task alignment scores, but the primary probing results in Figure 1 and Table 6 do not include variance or spread measures.", 172 "source": "haiku" 173 } 174 }, 175 "evaluation_design": { 176 "baselines_included": { 177 "applies": true, 178 "answer": true, 179 "justification": "Two principled baselines are used: unmodified input ('None' prompt) and five random word prompts to control for the effect of simply adding tokens.", 180 "source": "haiku" 181 }, 182 "baselines_contemporary": { 183 "applies": true, 184 "answer": true, 185 "justification": "The baselines (no prompt and random prompt) are appropriate and principled for this type of representation analysis; the random baseline echoes Lu et al. 2024.", 186 "source": "haiku" 187 }, 188 "ablation_study": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 3.2 contains four ablation studies: representation choice (pooling strategies, CLS vs average), task alignment as alternative metric, prompt structure (masked tokens, [SEP] separator), and static vs. 
contextual prompts.", 192 "source": "haiku" 193 }, 194 "multiple_metrics": { 195 "applies": true, 196 "answer": true, 197 "justification": "Both probe performance (MaxEnt classifier accuracy/F1) and task alignment scores are used; Table 3 verifies strong correlation between the two metrics (Spearman ρ=0.84).", 198 "source": "haiku" 199 }, 200 "human_evaluation": { 201 "applies": false, 202 "answer": false, 203 "justification": "This is a computational study of embedding representations with no human evaluation component needed.", 204 "source": "haiku" 205 }, 206 "held_out_test_set": { 207 "applies": true, 208 "answer": true, 209 "justification": "Probes are trained on train partitions and evaluated on held-out test partitions of each dataset as described in Section 2.2 and Table 5.", 210 "source": "haiku" 211 }, 212 "per_category_breakdown": { 213 "applies": true, 214 "answer": true, 215 "justification": "Results are broken down by task (toxicity, sentiment, topic, NLI), dataset, model architecture, and representation strategy throughout Figure 1 and Tables 2, 6, and 7.", 216 "source": "haiku" 217 }, 218 "failure_cases_discussed": { 219 "applies": true, 220 "answer": true, 221 "justification": "The paper explicitly discusses cases where relevant prompts degrade performance (GPT-2 consistently degrades, RTE shows decline with most prompts) as central findings, not buried in appendices.", 222 "source": "haiku" 223 }, 224 "negative_results_reported": { 225 "applies": true, 226 "answer": true, 227 "justification": "The entire paper is a negative result: the hypothesis that relevant prompts improve representations is not supported, reported transparently as the main contribution.", 228 "source": "haiku" 229 } 230 }, 231 "setup_transparency": { 232 "model_versions_specified": { 233 "applies": true, 234 "answer": false, 235 "justification": "BERT, RoBERTa, and GPT-2 are cited by their original papers but specific checkpoint names (e.g., bert-base-uncased vs. 
bert-large) and parameter counts are never specified.", 236 "source": "haiku" 237 }, 238 "prompts_provided": { 239 "applies": true, 240 "answer": true, 241 "justification": "All 26 prompt templates (5 per task × 4 tasks, 5 random, 1 no-prompt) are fully provided in Table 1 with exact wording.", 242 "source": "haiku" 243 }, 244 "hyperparameters_reported": { 245 "applies": true, 246 "answer": false, 247 "justification": "The probe classifier type (MaxEnt with L2 regularization) is mentioned but the regularization strength C and other hyperparameters are not specified.", 248 "source": "haiku" 249 }, 250 "scaffolding_described": { 251 "applies": false, 252 "answer": false, 253 "justification": "No agentic scaffolding involved; this is a probing study on pretrained model embeddings.", 254 "source": "haiku" 255 }, 256 "data_preprocessing_documented": { 257 "applies": true, 258 "answer": true, 259 "justification": "Tokenization and embedding strategies are described in detail: layer selection (last vs. second-to-last), token pooling (CLS vs. average vs. 
weighted average for GPT-2), and template application method (substitution into placeholders) are all specified.", 260 "source": "haiku" 261 } 262 }, 263 "data_integrity": { 264 "raw_data_available": { 265 "applies": true, 266 "answer": true, 267 "justification": "All datasets are standard publicly available benchmarks accessible via HuggingFace Datasets; dataset URLs are provided in footnotes.", 268 "source": "haiku" 269 }, 270 "data_collection_described": { 271 "applies": true, 272 "answer": true, 273 "justification": "Dataset sources, number of classes, class distribution, average sequence length, and train/test split sizes are documented in Table 5.", 274 "source": "haiku" 275 }, 276 "recruitment_methods_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants or recruitment; standard benchmark datasets are used.", 280 "source": "haiku" 281 }, 282 "data_pipeline_documented": { 283 "applies": true, 284 "answer": true, 285 "justification": "The pipeline from input text → template application → tokenization → embedding generation → probe training → test evaluation is described step-by-step in Section 2.", 286 "source": "haiku" 287 } 288 }, 289 "contamination": { 290 "training_cutoff_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "Training data cutoffs for BERT, RoBERTa, and GPT-2 are not stated, and the possibility that evaluation datasets were in their pre-training corpora is not addressed.", 294 "source": "haiku" 295 }, 296 "train_test_overlap_discussed": { 297 "applies": true, 298 "answer": false, 299 "justification": "The paper does not discuss whether pre-training corpora of BERT/RoBERTa/GPT-2 overlap with IMDB, AG News, or other evaluation datasets, which could inflate probe performance baselines.", 300 "source": "haiku" 301 }, 302 "benchmark_contamination_addressed": { 303 "applies": true, 304 "answer": false, 305 "justification": "Widely-used datasets like IMDB and AG News were likely 
present in pre-training corpora of BERT-era models published in 2019; this potential contamination is not discussed despite being directly relevant to probing conclusions.", 306 "source": "haiku" 307 } 308 }, 309 "human_studies": { 310 "pre_registered": { 311 "applies": false, 312 "answer": false, 313 "justification": "No human participants in this study.", 314 "source": "haiku" 315 }, 316 "irb_or_ethics_approval": { 317 "applies": false, 318 "answer": false, 319 "justification": "No human participants in this study.", 320 "source": "haiku" 321 }, 322 "demographics_reported": { 323 "applies": false, 324 "answer": false, 325 "justification": "No human participants in this study.", 326 "source": "haiku" 327 }, 328 "inclusion_exclusion_criteria": { 329 "applies": false, 330 "answer": false, 331 "justification": "No human participants in this study.", 332 "source": "haiku" 333 }, 334 "randomization_described": { 335 "applies": false, 336 "answer": false, 337 "justification": "No human participants in this study.", 338 "source": "haiku" 339 }, 340 "blinding_described": { 341 "applies": false, 342 "answer": false, 343 "justification": "No human participants in this study.", 344 "source": "haiku" 345 }, 346 "attrition_reported": { 347 "applies": false, 348 "answer": false, 349 "justification": "No human participants in this study.", 350 "source": "haiku" 351 } 352 }, 353 "cost_and_practicality": { 354 "inference_cost_reported": { 355 "applies": true, 356 "answer": false, 357 "justification": "No inference cost, latency, or GPU hours are reported; only the qualitative statement that experiments can run on 'mid-sized hardware' is provided.", 358 "source": "haiku" 359 }, 360 "compute_budget_stated": { 361 "applies": true, 362 "answer": false, 363 "justification": "The ARTEMISA compute resource is acknowledged in the Acknowledgments but no specific compute budget (GPU hours, node-hours, total cost) is stated.", 364 "source": "haiku" 365 } 366 } 367 } 368 }, 369 "claims": [ 
370 { 371 "claim": "Prompting modifies sentence-level representations primarily through token contextualization, not by token addition alone", 372 "evidence": "The static prompt ablation (Table 4) shows that averaging template and sample embeddings without contextualization eliminates prompting effects, confirming contextualization is the operative mechanism", 373 "supported": "strong" 374 }, 375 { 376 "claim": "Relevant prompts do not consistently produce better representations than irrelevant or random prompts", 377 "evidence": "Figure 1 and Table 6 show no consistent pattern across tasks, datasets, or models: random prompts sometimes outperform relevant ones, and relevant prompts sometimes degrade probe performance relative to baseline", 378 "supported": "strong" 379 }, 380 { 381 "claim": "The effect of prompting on representations is highly model- and dataset-dependent", 382 "evidence": "BERT shows improvements with any prompt on Wiki Toxic/IMDB; RoBERTa behavior varies by dataset; GPT-2 consistently degrades — no single cross-model pattern holds", 383 "supported": "strong" 384 }, 385 { 386 "claim": "Task alignment and probing performance are strongly correlated, reflecting the same underlying representational change", 387 "evidence": "Table 3 reports Pearson r=0.75 and Spearman ρ=0.84 between task alignment and probe performance (both p<10⁻¹⁹), suggesting the two metrics capture the same phenomenon", 388 "supported": "strong" 389 }, 390 { 391 "claim": "Random prompts can improve probe performance over the no-prompt baseline, contradicting intuition", 392 "evidence": "Results throughout Figure 1 and Table 6 show statistically significant improvements from random prompts in several dataset-model combinations, echoing Lu et al. 
2024", 393 "supported": "strong" 394 }, 395 { 396 "claim": "Using smaller, non-instruction-tuned models may be insufficient to observe the hypothesized alignment between prompt relevance and representation quality", 397 "evidence": "Acknowledged as a limitation in Section 5: BERT/RoBERTa/GPT-2 pre-training corpora are much smaller than modern LLMs and no instruction fine-tuning was applied", 398 "supported": "moderate" 399 } 400 ], 401 "methodology_tags": [ 402 "observational", 403 "benchmark-eval" 404 ], 405 "key_findings": "Prompting alters sentence-level representations through token contextualization rather than mere token addition, as confirmed by a static prompt ablation where embedding averaging without contextualization eliminates all prompting effects. However, across three model architectures (BERT, RoBERTa, GPT-2), eight datasets (toxicity, sentiment, topic, NLI), and multiple pooling strategies, there is no consistent pattern showing that task-relevant prompts produce better embeddings than irrelevant or random prompts — directly refuting the paper's initial hypothesis. Random prompts sometimes outperform relevant ones, and relevant prompts sometimes degrade performance. The authors discuss three possible explanations: the embedding-level view may be too limited, the models may be too small and undertrained, or instruction fine-tuning may be necessary to produce prompt-aligned representations.", 406 "red_flags": [ 407 { 408 "flag": "Model variants unspecified", 409 "detail": "BERT, RoBERTa, and GPT-2 are cited by paper but specific checkpoint names (e.g., bert-base-uncased vs. bert-large) and parameter counts are never stated, making exact reproduction difficult." 410 }, 411 { 412 "flag": "No code released", 413 "detail": "No code repository is linked; with multiple models, pooling strategies, and datasets, reproduction requires guessing implementation decisions not documented in the paper." 
414 }, 415 { 416 "flag": "Probe hyperparameters missing", 417 "detail": "MaxEnt classifier with L2 regularization is used for all probing but the regularization strength C is never specified, which could substantially affect results." 418 }, 419 { 420 "flag": "Pre-training contamination unaddressed", 421 "detail": "IMDB, AG News, and other evaluation datasets were widely available before BERT/RoBERTa/GPT-2 pre-training; the possibility that these datasets appear in pre-training corpora is not discussed, despite being directly relevant to baseline probe performance levels." 422 }, 423 { 424 "flag": "Tiny absolute effect sizes", 425 "detail": "Most probe performance differences between prompts are on the order of one percentage point in absolute terms (e.g., 60.25 vs 61.55 F1+%, a 1.30-point gap), making practical significance questionable even where statistical significance is established via bootstrap." 426 } 427 ], 428 "cited_papers": [ 429 { 430 "title": "Language Models are Few-Shot Learners (Brown et al., 2020)", 431 "relevance": "Foundational paper establishing prompting as a paradigm and ICL; central reference for in-context learning claims throughout." 432 }, 433 { 434 "title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding (Devlin et al., 2019)", 435 "relevance": "One of three models used in experiments; defines the MLM pre-training objective and CLS token strategy studied." 436 }, 437 { 438 "title": "Pre-train, Prompt, and Predict: A Systematic Survey of Prompting Methods in NLP (Liu et al., 2023)", 439 "relevance": "Survey establishing the prompting pipeline formalism used as the conceptual framework for the experimental setup." 440 }, 441 { 442 "title": "In-context learning of representations (Park et al., 2025)", 443 "relevance": "Closest related work; explicitly contrasted — they study LMs producing new in-context representations while this paper studies improvement of existing ones via prompting." 
444 }, 445 { 446 "title": "The geometry of prompting: Unveiling distinct mechanisms of task adaptation in language models (Kirsanov et al., 2025)", 447 "relevance": "Direct related work studying representational changes from prompting using class separability in large autoregressive models on synthetic datasets." 448 }, 449 { 450 "title": "Strings from the library of babel: Random sampling as a strong baseline for prompt optimisation (Lu et al., 2024)", 451 "relevance": "Prior work showing random prompts can be surprisingly effective, corroborated and extended by this paper's findings." 452 }, 453 { 454 "title": "In-context learning and induction heads (Olsson et al., 2022)", 455 "relevance": "Mechanistic interpretation of ICL via attention head circuits, providing theoretical background for the ICL mechanisms studied." 456 }, 457 { 458 "title": "Analysis methods in neural language processing: A survey (Belinkov and Glass, 2019)", 459 "relevance": "Survey of probing methodology that this paper builds upon as its primary analysis technique." 460 } 461 ], 462 "engagement_factors": { 463 "practical_relevance": { 464 "score": 2, 465 "justification": "Practitioners using prompt engineering need to understand whether prompt wording affects internal representations, but the inconsistency finding provides limited actionable guidance." 466 }, 467 "surprise_contrarian": { 468 "score": 3, 469 "justification": "Directly challenges the widely-held assumption that more relevant prompts produce better internal representations — the foundational intuition behind much prompt engineering practice." 470 }, 471 "fear_safety": { 472 "score": 0, 473 "justification": "No safety or risk implications; this is a mechanistic understanding study of embedding spaces." 474 }, 475 "drama_conflict": { 476 "score": 1, 477 "justification": "The negative result is notable but not controversial enough to generate community conflict; the authors are measured in their claims." 
478 }, 479 "demo_ability": { 480 "score": 2, 481 "justification": "Public datasets and model weights are available via HuggingFace; a practitioner could replicate the basic probing setup, though missing hyperparameters limit exact reproduction." 482 }, 483 "brand_recognition": { 484 "score": 1, 485 "justification": "Authors are at UPC and Bocconi, not major AI labs; ERC-funded European academic work with no industry brand recognition." 486 } 487 }, 488 "hn_data": { 489 "threads": [ 490 { 491 "hn_id": "42898914", 492 "title": "Gradual Disempowerment: How Even Incremental AI Progress Poses Existential Risks", 493 "points": 87, 494 "comments": 84, 495 "url": "https://news.ycombinator.com/item?id=42898914", 496 "created_at": "2025-02-01T15:12:22Z" 497 }, 498 { 499 "hn_id": "38036218", 500 "title": "Zephyr 7B", 501 "points": 4, 502 "comments": 0, 503 "url": "https://news.ycombinator.com/item?id=38036218", 504 "created_at": "2023-10-27T09:06:34Z" 505 }, 506 { 507 "hn_id": "25604385", 508 "title": "Learning from Heterogeneous EEG Signals with Differentiable Channel Reordering", 509 "points": 2, 510 "comments": 0, 511 "url": "https://news.ycombinator.com/item?id=25604385", 512 "created_at": "2021-01-01T16:33:05Z" 513 }, 514 { 515 "hn_id": "42915646", 516 "title": "Stack Overflow Meets Replication: Security Research Amid Evolving Code Snippets", 517 "points": 1, 518 "comments": 0, 519 "url": "https://news.ycombinator.com/item?id=42915646", 520 "created_at": "2025-02-03T06:49:46Z" 521 } 522 ], 523 "top_points": 87, 524 "total_points": 94, 525 "total_comments": 84 526 } 527 }