scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24269B)
      1 {
      2   "paper": {
      3     "title": "Do Prompts Reshape Representations? An Empirical Study of Prompting Effects on Embeddings",
      4     "authors": ["Cesar Gonzalez-Gutierrez", "Dirk Hovy"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2510.19694",
      8     "doi": "10.48550/arXiv.2510.19694"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Prompting modifies sentence-level representations through contextualization rather than just adding tokens, but changes in representation quality do not consistently correlate with prompt relevance to the target task. Task-relevant prompts do not reliably improve representations over irrelevant or random prompts across models (BERT, RoBERTa, GPT-2) and tasks (toxicity, sentiment, topic, NLI). Static (non-contextualized) prompts neutralize prompting effects, confirming that contextualization within the model is necessary.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All datasets used are publicly available benchmarks (Wiki Toxic, IMDB, AG News, Swahili News, RTE, Adversarial NLI) sourced from HuggingFace Datasets and standard NLP repositories, with citations and URLs provided (Section 2.2, Appendix A)."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, or detailed environment setup listing library versions is provided. The paper mentions the models used but not the software environment."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions, README, or scripts are provided."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Figure 1 shows error bars for all results, and Table 2 reports standard deviations in subscripts. Statistical significance lines are shown in figures."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Bootstrap sampling statistics used throughout (Section 2.2), with p-values reported at p<0.05 and p<0.01 levels relative to both no-prompt and random-prompt baselines, using the boostsa library (Fornaciari et al., 2022)."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper reports raw performance differences (e.g., F1 and accuracy percentages) with significance tests, but does not report standardized effect sizes (Cohen's d, etc.). The differences shown are small (typically <1-2 percentage points) but no formal effect size measure is provided."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for the number of datasets, models, or prompt templates chosen. No power analysis is discussed."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Standard deviations reported in subscripts in Table 2, and bootstrap-based error bars shown in Figure 1."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Two baselines are used: (1) unmodified input (no prompt) and (2) random word prompts, both clearly stated in Section 2.1."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The baselines (no prompt and random prompt) are appropriate for the research question. The study also references Lu et al. (2024) on random prompts as baselines, which is contemporary work."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple ablation studies in Section 3.2: representation choice (3.2.1), task alignment metric (3.2.2), prompt structure with masked prompts and separator tokens (3.2.3), and static prompts (3.2.4)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics used: F1+ for Wiki Toxic, accuracy for IMDB/AG News/RTE, F1 for Arise News/Swahili News/Adversarial NLI (Figure 1), plus task alignment as an alternative metric (Section 3.2.2, Table 2)."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is not relevant to this study of representation quality via automated probing."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Section 2.2 states that the probes were evaluated on held-out data ('tested their performance on the test partition'), and Appendix A (Table 5) lists separate train/test partition sizes for all datasets."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down per dataset, per model architecture, per embedding strategy, and per prompt type across Figure 1 and Tables 2, 6, and 7."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper extensively discusses where the hypothesis fails: relevant prompts sometimes degrade performance, random prompts sometimes improve it, and GPT-2 consistently shows degraded performance (Section 3.1)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The entire paper is essentially a negative result: the hypothesis that relevant prompts improve representations is not supported. This is explicitly stated in Section 3.1 and the Conclusion."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims that prompting affects representation quality but changes do not consistently correlate with prompt relevance. This is supported by Figure 1 and Tables 2, 6, and 7, which show inconsistent patterns across tasks and models."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper makes causal claims about prompting modifying representations ('prompting contextualizes sentence representations'). The ablation design (static vs. contextualized prompts, masked prompts, separator tokens) provides controlled single-variable manipulations that adequately support these claims."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The Limitations section explicitly bounds the findings: models used have 'relatively small corpora compared to those used for modern large-scale models,' the analysis focused on 'a limited set of classification tasks,' and results 'may not generalize to larger, instruction-tuned models.'"
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 5 discusses three alternative explanations: (1) the embedding-level perspective may be too limited to capture ICL, (2) models may not have been sufficiently pre-trained, and (3) instruction fine-tuning or RLHF may be necessary for effective prompting."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper clearly distinguishes between what it measures (probe performance on embeddings) and the broader phenomenon it studies (in-context learning). The paper explicitly states that 'the embedding-level perspective may be too limited to capture the complexities of ICL' (Section 5)."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper specifies model families (BERT, RoBERTa, GPT-2) with citations but does not provide specific checkpoint versions, model sizes (e.g., bert-base-uncased vs bert-large), or HuggingFace model identifiers."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "All 26 prompt templates are provided verbatim in Table 1, including the 5 random prompts, 5 per task, and the no-prompt baseline."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No hyperparameters are reported for the MaxEnt probe classifiers (L2 regularization strength) or for the models. No temperature or sampling settings mentioned."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The paper applies prompt templates to models and extracts embeddings."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper does not describe how dataset samples were preprocessed before embedding. For two-input tasks with single-input prompts, it mentions concatenation, but detailed preprocessing steps are absent."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "A dedicated 'Limitations' section is present after the Conclusion, discussing model size, corpus size, task scope, and the static view of representations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The Limitations section discusses specific threats: the models were pre-trained on smaller corpora than modern LLMs, the analysis adopts a static view that may miss layer dynamics, and the task set is limited to classification. These are specific to this study."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The Limitations section explicitly states what results do NOT show: findings 'may not generalize to larger, instruction-tuned models,' the analysis 'focused on a limited set of classification tasks,' and 'generalizability to other tasks, especially those that lie in more complex output spaces, remains an open question.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (embeddings, probe predictions, bootstrap samples) is released. Only aggregated results in figures and tables."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 2.2 and Appendix A describe the datasets used with citations, sources (HuggingFace), sizes, and characteristics (Table 5)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. All data comes from standard public benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The pipeline from raw datasets to final probe results is described at a high level (apply prompt template → generate embeddings → train probe → evaluate) but specific details like how multi-class datasets were handled for binary probes, exact train/test splits used, or any filtering are not documented."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgments section lists ERC grant No 853459, EU ERDF and Comunitat Valenciana funding for ARTEMISA compute, and AGAUR recognition 2021SGR-Cat (01266 LQMC)."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations clearly listed: Polytechnic University of Catalonia and Bocconi University. No product being evaluated, so no conflict."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "ERC and EU research funding bodies have no stake in the outcome of this study of prompting mechanisms."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This paper studies representation quality via probing, not model capability on benchmarks. The probes are trained on the representations; they do not evaluate the LMs' task performance. Contamination of the LM training data is not relevant to the probing methodology."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Same as above — the paper probes embeddings rather than evaluating pre-trained model capability on benchmarks."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Same as above — contamination of the pre-trained models is not relevant since the paper measures representational properties, not task accuracy of the LMs."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost, latency, or compute time reported for generating embeddings or training probes."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "The Acknowledgments mention ARTEMISA compute resources but no GPU hours, total compute time, or budget is stated."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Bootstrap sampling statistics are used across experiments, with standard deviations reported in Table 2 and error bars in Figure 1, showing variance across resampled runs."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The paper does not state the number of bootstrap samples used or the number of probe training runs. The boostsa library is cited but its configuration is not specified."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The L2 regularization strength for the MaxEnt probes is not specified, and no hyperparameter search budget is reported."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper reports results across multiple configurations (layers, pooling strategies) but does not explain how the subset shown in Figure 1 (main results) was selected from the full set in Table 6."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Many statistical comparisons are made across prompts, models, datasets, and representation strategies, but no correction for multiple comparisons (Bonferroni, etc.) is applied."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "The paper does not propose a competing system — it studies representational properties of prompting. There is no system vs. baseline comparison where author bias would apply."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "Compute differences across experimental conditions are negligible (same models, same datasets, slightly different prompt lengths)."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "The paper discusses whether probing truly measures what is claimed — Section 5 acknowledges 'the embedding-level perspective may be too limited to capture the complexities of ICL' and the Limitations section questions whether the static view of representations is sufficient."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved in this study."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The datasets used (IMDB 2011, Wiki Toxic 2017, AG News 2015) predate the models (BERT 2019, RoBERTa 2019, GPT-2 2019), meaning model training data could contain benchmark examples. This is not discussed."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the prompt templates leak task information through features that would not be available in a true zero-shot setting."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether training data for the models may overlap with the evaluation datasets."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection or prevention method is used."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Prompting modifies sentence-level representations primarily through contextualization of tokens, not just by introducing new tokens.",
    365       "evidence": "Section 3.2.3 (masked prompt experiments, Table 7 top) shows that contextualized sample tokens alone are sufficient to build a representation. Section 3.2.4 (static prompt experiments, Table 4) shows that static prompts neutralize the prompting effect.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Changes in representation quality due to prompting do not consistently correlate with the relevance of the prompt to the target task.",
    370       "evidence": "Figure 1 shows inconsistent patterns across all four tasks and three models. Irrelevant prompts sometimes improve performance, and relevant prompts sometimes degrade it. Section 3.1 provides detailed analysis.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Task alignment changes induced by prompting parallel probing performance changes, with strong positive correlation (Pearson r=0.75, Spearman ρ=0.84).",
    375       "evidence": "Table 2 (task alignment scores) and Table 3 (correlation coefficients between task alignment and probe performance), Section 3.2.2.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Model architecture has a notable impact on how prompts affect representations: BERT shows improvements with any prompt, RoBERTa varies, and GPT-2 consistently degrades.",
    380       "evidence": "Section 3.1, Figure 1a (Wiki Toxic): BERT shows significant improvement, GPT-2 shows degraded performance across prompts.",
    381       "supported": "moderate"
    382     }
    383   ],
    384   "red_flags": [
    385     {
    386       "flag": "Small and dated models only",
    387       "detail": "The study uses only BERT, RoBERTa, and GPT-2 — models with a few hundred million parameters pre-trained on relatively small corpora. Findings may not generalize to modern instruction-tuned LLMs where prompting is most commonly used. The authors acknowledge this in Limitations."
    388     },
    389     {
    390       "flag": "No multiple comparison correction",
    391       "detail": "Dozens of statistical comparisons are made across prompts, models, datasets, and representation strategies without Bonferroni or similar correction. Some significant results at p<0.05 may be false positives."
    392     },
    393     {
    394       "flag": "Missing model version specificity",
    395       "detail": "Exact model checkpoints (e.g., bert-base-uncased vs bert-large-uncased) are not specified, making reproduction uncertain."
    396     }
    397   ],
    398   "cited_papers": [
    399     {
    400       "title": "Language Models are Few-Shot Learners",
    401       "authors": ["Tom B. Brown"],
    402       "year": 2020,
    403       "arxiv_id": "2005.14165",
    404       "relevance": "Foundational work on prompting and in-context learning with GPT-3, establishing the paradigm this paper investigates."
    405     },
    406     {
    407       "title": "Pre-train, Prompt, and Predict: A Systematic Survey of Prompting Methods in Natural Language Processing",
    408       "authors": ["Pengfei Liu", "Weizhe Yuan", "Jinlan Fu"],
    409       "year": 2023,
    410       "relevance": "Comprehensive survey of prompting methods in NLP, providing the prompting pipeline framework used in this paper."
    411     },
    412     {
    413       "title": "Strings from the Library of Babel: Random Sampling as a Strong Baseline for Prompt Optimisation",
    414       "authors": ["Yao Lu", "Jiayi Wang", "Raphael Tang"],
    415       "year": 2024,
    416       "relevance": "Found that random prompts can perform unexpectedly well, directly relevant to this paper's finding that irrelevant prompts can improve representations."
    417     },
    418     {
    419       "title": "Are Emergent Abilities of Large Language Models a Mirage?",
    420       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    421       "year": 2023,
    422       "arxiv_id": "2304.15004",
    423       "relevance": "Questions emergent abilities narrative in LLMs, relevant to understanding whether prompting effects are genuine capabilities."
    424     },
    425     {
    426       "title": "What Learning Algorithm is In-Context Learning? Investigations with Linear Models",
    427       "authors": ["Ekin Akyürek"],
    428       "year": 2023,
    429       "arxiv_id": "2211.15661",
    430       "relevance": "Studies ICL mechanisms as implicit gradient descent, an alternative algorithmic perspective on in-context learning."
    431     },
    432     {
    433       "title": "In-context Learning and Induction Heads",
    434       "authors": ["Catherine Olsson"],
    435       "year": 2022,
    436       "relevance": "Mechanistic interpretability work on transformer circuits enabling in-context learning."
    437     },
    438     {
    439       "title": "Few-shot Fine-tuning vs. In-context Learning: A Fair Comparison and Evaluation",
    440       "authors": ["Marius Mosbach"],
    441       "year": 2023,
    442       "relevance": "Compares fine-tuning and ICL approaches, relevant to understanding when prompting is effective."
    443     },
    444     {
    445       "title": "Demystifying Prompts in Language Models via Perplexity Estimation",
    446       "authors": ["Hila Gonen"],
    447       "year": 2024,
    448       "arxiv_id": "2212.04037",
    449       "relevance": "Studies the relationship between prompt perplexity and task performance, complementary perspective on why prompts work."
    450     }
    451   ]
    452 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs