scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (30519B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Do Prompts Reshape Representations? An Empirical Study of Prompting Effects on Embeddings",
      6     "authors": [
      7       "Cesar Gonzalez-Gutierrez",
      8       "Dirk Hovy"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2510.19694",
     13     "doi": "10.48550/arXiv.2510.19694"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The abstract claims that prompting affects representation quality but changes do not consistently correlate with prompt relevance. This is supported by Figure 1 and Tables 2, 6, and 7 showing inconsistent patterns across tasks and models.",
     21         "source": "opus"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper makes causal claims about prompting modifying representations ('prompting contextualizes sentence representations'). The ablation design (static vs. contextualized prompts, masked prompts, separator tokens) provides controlled single-variable manipulations that adequately support these claims.",
     27         "source": "opus"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The Limitations section explicitly bounds the findings: models used have 'relatively small corpora compared to those used for modern large-scale models,' the analysis focused on 'a limited set of classification tasks,' and results 'may not generalize to larger, instruction-tuned models.'",
     33         "source": "opus"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Section 5 discusses three alternative explanations: (1) the embedding-level perspective may be too limited to capture ICL, (2) models may not have been sufficiently pre-trained, and (3) instruction fine-tuning or RLHF may be necessary for effective prompting.",
     39         "source": "opus"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper clearly distinguishes between what it measures (probe performance on embeddings) and the broader phenomenon it studies (in-context learning). The paper explicitly states that 'the embedding-level perspective may be too limited to capture the complexities of ICL' (Section 5).",
     45         "source": "opus"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "A dedicated 'Limitations' section is present after the Conclusion, discussing model size, corpus size, task scope, and the static view of representations.",
     53         "source": "opus"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The Limitations section discusses specific threats: the models were pre-trained on smaller corpora than modern LLMs, the analysis adopts a static view that may miss layer dynamics, and the task set is limited to classification. These are specific to this study.",
     59         "source": "opus"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The Limitations section explicitly states what results do NOT show: findings 'may not generalize to larger, instruction-tuned models,' the analysis 'focused on a limited set of classification tasks,' and 'generalizability to other tasks, especially those that lie in more complex output spaces, remains an open question.'",
     65         "source": "opus"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Acknowledgments section lists ERC grant No 853459, EU ERDF and Comunitat Valenciana funding for ARTEMISA compute, and AGAUR recognition 2021SGR-Cat (01266 LQMC).",
     73         "source": "opus"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Author affiliations clearly listed: Polytechnic University of Catalonia and Bocconi University. No product being evaluated, so no conflict.",
     79         "source": "opus"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "ERC and EU research funding bodies have no stake in the outcome of this study of prompting mechanisms.",
     85         "source": "opus"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests or financial interests statement is present in the paper.",
     91         "source": "opus"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Key terms are defined in Sections 1 and 2: 'prompting,' 'in-context learning,' 'zero-shot learning,' 'probing,' and 'task alignment' all receive explicit definitions or formal references.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section 1 closes with a bullet-point list of three explicit contributions: empirical comparison of prompt representation quality, demonstration that prompting alters representations, and the null finding on prompt relevance.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 4 explicitly contrasts this work with Park et al. (2025) and Kirsanov et al. (2025) on representational changes from prompting, explaining how this work differs from them rather than just listing related papers.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": false,
    121           "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper.",
    122           "source": "opus"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "All datasets used are publicly available benchmarks (Wiki Toxic, IMDB, AG News, Swahili News, RTE, Adversarial NLI) sourced from HuggingFace Datasets and standard NLP repositories, with citations and URLs provided (Section 2.2, Appendix A).",
    128           "source": "opus"
    129         },
    130         "environment_specified": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "No requirements.txt, Dockerfile, or detailed environment setup listing library versions is provided. The paper mentions the models used but not the software environment.",
    134           "source": "opus"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "No step-by-step reproduction instructions, README, or scripts are provided.",
    140           "source": "opus"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "confidence_intervals_or_error_bars": {
    145           "applies": true,
    146           "answer": true,
    147           "justification": "Figure 1 shows error bars for all results, and Table 2 reports standard deviations in subscripts. Statistical significance lines are shown in figures.",
    148           "source": "opus"
    149         },
    150         "significance_tests": {
    151           "applies": true,
    152           "answer": true,
    153           "justification": "Bootstrap sampling statistics used throughout (Section 2.2), with p-values reported at p<0.05 and p<0.01 levels relative to both no-prompt and random-prompt baselines, using the boostsa library (Fornaciari et al., 2022).",
    154           "source": "opus"
    155         },
    156         "effect_sizes_reported": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "The paper reports raw performance differences (e.g., F1 and accuracy percentages) with significance tests, but does not report standardized effect sizes (Cohen's d, etc.). The differences shown are small (typically <1-2 percentage points) but no formal effect size measure is provided.",
    160           "source": "opus"
    161         },
    162         "sample_size_justified": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "No justification for the number of datasets, models, or prompt templates chosen. No power analysis is discussed.",
    166           "source": "opus"
    167         },
    168         "variance_reported": {
    169           "applies": true,
    170           "answer": true,
    171           "justification": "Standard deviations reported in subscripts in Table 2, and bootstrap-based error bars shown in Figure 1.",
    172           "source": "opus"
    173         }
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "Two baselines are used: (1) unmodified input (no prompt) and (2) random word prompts, both clearly stated in Section 2.1.",
    180           "source": "opus"
    181         },
    182         "baselines_contemporary": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "The baselines (no prompt and random prompt) are appropriate for the research question. The study also references Lu et al. (2024) on random prompts as baselines, which is contemporary work.",
    186           "source": "opus"
    187         },
    188         "ablation_study": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Multiple ablation studies in Section 3.2: representation choice (3.2.1), task alignment metric (3.2.2), prompt structure with masked prompts and separator tokens (3.2.3), and static prompts (3.2.4).",
    192           "source": "opus"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "Multiple metrics used: F1+ for Wiki Toxic, accuracy for IMDB/AG News/RTE, F1 for Arise News/Swahili News/Adversarial NLI (Figure 1), plus task alignment as an alternative metric (Section 3.2.2, Table 2).",
    198           "source": "opus"
    199         },
    200         "human_evaluation": {
    201           "applies": false,
    202           "answer": false,
    203           "justification": "Human evaluation is not relevant to this study of representation quality via automated probing.",
    204           "source": "opus"
    205         },
    206         "held_out_test_set": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "Section 2.2 states that the probes were evaluated on held-out data (the authors 'tested their performance on the test partition'), and Appendix A (Table 5) lists separate train/test partition sizes for all datasets.",
    210           "source": "opus"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Results are broken down per dataset, per model architecture, per embedding strategy, and per prompt type across Figure 1 and Tables 2, 6, and 7.",
    216           "source": "opus"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "The paper extensively discusses where the hypothesis fails: relevant prompts sometimes degrade performance, random prompts sometimes improve it, and GPT-2 consistently shows degraded performance (Section 3.1).",
    222           "source": "opus"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "The entire paper is essentially a negative result: the hypothesis that relevant prompts improve representations is not supported. This is explicitly stated in Section 3.1 and the Conclusion.",
    228           "source": "opus"
    229         }
    230       },
    231       "setup_transparency": {
    232         "model_versions_specified": {
    233           "applies": true,
    234           "answer": false,
    235           "justification": "The paper specifies model families (BERT, RoBERTa, GPT-2) with citations but does not provide specific checkpoint versions, model sizes (e.g., bert-base-uncased vs bert-large), or HuggingFace model identifiers.",
    236           "source": "opus"
    237         },
    238         "prompts_provided": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "All 26 prompt templates are provided verbatim in Table 1, including the 5 random prompts, 5 per task, and the no-prompt baseline.",
    242           "source": "opus"
    243         },
    244         "hyperparameters_reported": {
    245           "applies": true,
    246           "answer": false,
    247           "justification": "No hyperparameters are reported for the MaxEnt probe classifiers (L2 regularization strength) or for the models. No temperature or sampling settings mentioned.",
    248           "source": "opus"
    249         },
    250         "scaffolding_described": {
    251           "applies": false,
    252           "answer": false,
    253           "justification": "No agentic scaffolding is used. The paper applies prompt templates to models and extracts embeddings.",
    254           "source": "opus"
    255         },
    256         "data_preprocessing_documented": {
    257           "applies": true,
    258           "answer": false,
    259           "justification": "The paper does not describe how dataset samples were preprocessed before embedding. For two-input tasks with single-input prompts, it mentions concatenation, but detailed preprocessing steps are absent.",
    260           "source": "opus"
    261         }
    262       },
    263       "data_integrity": {
    264         "raw_data_available": {
    265           "applies": true,
    266           "answer": false,
    267           "justification": "No raw experimental data (embeddings, probe predictions, bootstrap samples) is released. Only aggregated results in figures and tables.",
    268           "source": "opus"
    269         },
    270         "data_collection_described": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "Section 2.2 and Appendix A describe the datasets used with citations, sources (HuggingFace), sizes, and characteristics (Table 5).",
    274           "source": "opus"
    275         },
    276         "recruitment_methods_described": {
    277           "applies": false,
    278           "answer": false,
    279           "justification": "No human participants. All data comes from standard public benchmarks.",
    280           "source": "opus"
    281         },
    282         "data_pipeline_documented": {
    283           "applies": true,
    284           "answer": false,
    285           "justification": "The pipeline from raw datasets to final probe results is described at a high level (apply prompt template → generate embeddings → train probe → evaluate) but specific details like how multi-class datasets were handled for binary probes, exact train/test splits used, or any filtering are not documented.",
    286           "source": "opus"
    287         }
    288       },
    289       "contamination": {
    290         "training_cutoff_stated": {
    291           "applies": false,
    292           "answer": false,
    293           "justification": "This paper studies representation quality via probing, not model capability on benchmarks. The probes are trained on the representations, not evaluating the LMs' task performance. Contamination of the LM training data is not relevant to the probing methodology.",
    294           "source": "opus"
    295         },
    296         "train_test_overlap_discussed": {
    297           "applies": false,
    298           "answer": false,
    299           "justification": "Same as above — the paper probes embeddings rather than evaluating pre-trained model capability on benchmarks.",
    300           "source": "opus"
    301         },
    302         "benchmark_contamination_addressed": {
    303           "applies": false,
    304           "answer": false,
    305           "justification": "Same as above — contamination of the pre-trained models is not relevant since the paper measures representational properties, not task accuracy of the LMs.",
    306           "source": "opus"
    307         }
    308       },
    309       "human_studies": {
    310         "pre_registered": {
    311           "applies": false,
    312           "answer": false,
    313           "justification": "No human participants in this study.",
    314           "source": "opus"
    315         },
    316         "irb_or_ethics_approval": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants in this study.",
    320           "source": "opus"
    321         },
    322         "demographics_reported": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants in this study.",
    326           "source": "opus"
    327         },
    328         "inclusion_exclusion_criteria": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants in this study.",
    332           "source": "opus"
    333         },
    334         "randomization_described": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants in this study.",
    338           "source": "opus"
    339         },
    340         "blinding_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants in this study.",
    344           "source": "opus"
    345         },
    346         "attrition_reported": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants in this study.",
    350           "source": "opus"
    351         }
    352       },
    353       "cost_and_practicality": {
    354         "inference_cost_reported": {
    355           "applies": true,
    356           "answer": false,
    357           "justification": "No inference cost, latency, or compute time reported for generating embeddings or training probes.",
    358           "source": "opus"
    359         },
    360         "compute_budget_stated": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "The Acknowledgments mention ARTEMISA compute resources but no GPU hours, total compute time, or budget is stated.",
    364           "source": "opus"
    365         }
    366       },
    367       "experimental_rigor": {
    368         "seed_sensitivity_reported": {
    369           "applies": true,
    370           "answer": true,
    371           "justification": "Bootstrap sampling statistics are used across experiments, with standard deviations reported in Table 2 and error bars in Figure 1, showing variance across resampled runs.",
    372           "source": "opus"
    373         },
    374         "number_of_runs_stated": {
    375           "applies": true,
    376           "answer": false,
    377           "justification": "The paper does not state the number of bootstrap samples used or the number of probe training runs. The boostsa library is cited but its configuration is not specified.",
    378           "source": "opus"
    379         },
    380         "hyperparameter_search_budget": {
    381           "applies": true,
    382           "answer": false,
    383           "justification": "The L2 regularization strength for the MaxEnt probes is not specified, and no hyperparameter search budget is reported.",
    384           "source": "opus"
    385         },
    386         "best_config_selection_justified": {
    387           "applies": true,
    388           "answer": false,
    389           "justification": "The paper reports results across multiple configurations (layers, pooling strategies) but does not explain how the subset shown in Figure 1 (main results) was selected from the full set in Table 6.",
    390           "source": "opus"
    391         },
    392         "multiple_comparison_correction": {
    393           "applies": true,
    394           "answer": false,
    395           "justification": "Many statistical comparisons are made across prompts, models, datasets, and representation strategies, but no correction for multiple comparisons (Bonferroni, etc.) is applied.",
    396           "source": "opus"
    397         },
    398         "self_comparison_bias_addressed": {
    399           "applies": false,
    400           "answer": false,
    401           "justification": "The paper does not propose a competing system — it studies representational properties of prompting. There is no system vs. baseline comparison where author bias would apply.",
    402           "source": "opus"
    403         },
    404         "compute_budget_vs_performance": {
    405           "applies": false,
    406           "answer": false,
    407           "justification": "Compute differences across experimental conditions are negligible (same models, same datasets, slightly different prompt lengths).",
    408           "source": "opus"
    409         },
    410         "benchmark_construct_validity": {
    411           "applies": true,
    412           "answer": true,
    413           "justification": "The paper discusses whether probing truly measures what is claimed — Section 5 acknowledges 'the embedding-level perspective is too limited to capture the complexities of ICL' and the Limitations section questions whether the static view of representations is sufficient.",
    414           "source": "opus"
    415         },
    416         "scaffold_confound_addressed": {
    417           "applies": false,
    418           "answer": false,
    419           "justification": "No scaffolding is involved in this study.",
    420           "source": "opus"
    421         }
    422       },
    423       "data_leakage": {
    424         "temporal_leakage_addressed": {
    425           "applies": true,
    426           "answer": false,
    427           "justification": "The datasets used (IMDB 2011, Wiki Toxic 2017, AG News 2015) predate the models (BERT 2019, RoBERTa 2019, GPT-2 2019), meaning model training data could contain benchmark examples. This is not discussed.",
    428           "source": "opus"
    429         },
    430         "feature_leakage_addressed": {
    431           "applies": true,
    432           "answer": false,
    433           "justification": "No discussion of whether the prompt templates leak task information through features that would not be available in a true zero-shot setting.",
    434           "source": "opus"
    435         },
    436         "non_independence_addressed": {
    437           "applies": true,
    438           "answer": false,
    439           "justification": "No discussion of whether training data for the models may overlap with the evaluation datasets.",
    440           "source": "opus"
    441         },
    442         "leakage_detection_method": {
    443           "applies": true,
    444           "answer": false,
    445           "justification": "No concrete leakage detection or prevention method is used.",
    446           "source": "opus"
    447         }
    448       }
    449     }
    450   },
    451   "claims": [
    452     {
    453       "claim": "Prompting modifies sentence-level representations through contextualization, not merely by adding tokens",
    454       "evidence": "Static prompt ablation (Table 4) shows that averaging embeddings without contextualization neutralizes any prompt effect, while contextualized prompts produce statistically significant differences",
    455       "supported": "strong"
    456     },
    457     {
    458       "claim": "Changes in prompt embedding quality do not follow a consistent or predictable pattern with respect to prompt relevance to the target task",
    459       "evidence": "Figure 1 and Tables 6–7 show that across models and tasks, task-relevant prompts do not consistently outperform random or irrelevant prompts; sometimes irrelevant prompts improve or relevant prompts degrade performance",
    460       "supported": "strong"
    461     },
    462     {
    463       "claim": "Random prompts can improve probe performance over the no-prompt baseline",
    464       "evidence": "Figure 1 shows statistically significant improvements with random prompts (orange significance lines) for BERT on Wiki Toxic and IMDB, echoing Lu et al. (2024)",
    465       "supported": "strong"
    466     },
    467     {
    468       "claim": "Model architecture has a stronger impact on probe performance than prompt relevance",
    469       "evidence": "Section 3.1 reports BERT generally shows improvements with any prompt including random, GPT-2 consistently degrades, and RoBERTa varies by dataset — architecture dominates prompt effects",
    470       "supported": "strong"
    471     },
    472     {
    473       "claim": "Task alignment and probing performance are strongly correlated, providing convergent evidence",
    474       "evidence": "Table 3 reports Pearson r=0.7475 (p<10^-19) and Spearman ρ=0.8412 (p<10^-28) between task alignment and probe performance across all conditions",
    475       "supported": "strong"
    476     }
    477   ],
    478   "methodology_tags": [
    479     "benchmark-eval",
    480     "observational"
    481   ],
    482   "key_findings": "Prompting contextualizes sentence representations — measurably altering embeddings — but the changes do not align with prompt relevance to the target task. Task-relevant prompts do not consistently outperform random or irrelevant prompts in probing experiments across BERT, RoBERTa, and GPT-2 on four classification tasks (toxicity, sentiment, topic, NLI). Effects are highly architecture- and dataset-dependent, with BERT improving under any prompt (including random) and GPT-2 consistently degrading. The authors conclude the embedding-level view may be insufficient to explain zero-shot ICL, and that limited pre-training scale prevents the expected prompt-relevance signal from emerging.",
    483   "red_flags": [
    484     {
    485       "flag": "Model versions unspecified",
    486       "detail": "BERT, RoBERTa, and GPT-2 are named without specific checkpoint identifiers (bert-base-uncased vs bert-large-cased, etc.), making exact reproduction impossible."
    487     },
    488     {
    489       "flag": "Effect sizes not reported",
    490       "detail": "Performance differences between prompt conditions are typically <1% absolute; statistical significance is reported but practical significance and effect sizes are not discussed."
    491     },
    492     {
    493       "flag": "No code released",
    494       "detail": "The probing pipeline, embedding generation, and task alignment computation code are not released, substantially limiting reproducibility."
    495     },
    496     {
    497       "flag": "Probe hyperparameters unspecified",
    498       "detail": "MaxEnt probes use L2 regularization but no specific regularization strength (C parameter) or solver configuration is reported, creating a hidden degree of freedom."
    499     },
    500     {
    501       "flag": "Benchmark contamination not addressed",
    502       "detail": "BERT/RoBERTa/GPT-2 may have seen IMDB (2011), AG News (2015), and Wiki Toxic (2017) during pre-training; this potential overlap is not acknowledged despite being directly relevant to what probing results measure."
    503     }
    504   ],
    505   "cited_papers": [
    506     {
    507       "title": "Pre-train, Prompt, and Predict: A Systematic Survey of Prompting Methods in Natural Language Processing",
    508       "relevance": "Directly foundational — defines the prompting pipeline formalization used in Section 2.1 and is the primary survey reference for the paradigm studied"
    509     },
    510     {
    511       "title": "In-Context Learning of Representations (ICLR)",
    512       "relevance": "Closely related concurrent work on LM representational changes under ICL — explicitly contrasted with this paper's approach in Section 4"
    513     },
    514     {
    515       "title": "The Geometry of Prompting: Unveiling Distinct Mechanisms of Task Adaptation in Language Models",
    516       "relevance": "Most directly related work — also studies representational changes from prompting; contrasted as focusing on class separability in large autoregressive models vs. this paper's cross-architecture probing approach"
    517     },
    518     {
    519       "title": "Strings from the Library of Babel: Random Sampling as a Strong Baseline for Prompt Optimisation",
    520       "relevance": "Key empirical precursor finding that random prompts can be competitive — directly echoed and cited for the unexpected random prompt results"
    521     },
    522     {
    523       "title": "Demystifying Prompts in Language Models via Perplexity Estimation",
    524       "relevance": "Related work on why prompts work — cited as motivation for why pre-training exposure to task-relevant language patterns matters"
    525     },
    526     {
    527       "title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding",
    528       "relevance": "One of the three model architectures studied; also provides the probing and embedding methodology foundation"
    529     },
    530     {
    531       "title": "Analyzing Text Representations by Measuring Task Alignment",
    532       "relevance": "The task alignment metric used as secondary evaluation measure in Section 3.2.2 originates from this prior paper by the first author"
    533     },
    534     {
    535       "title": "Language Models are Few-Shot Learners (GPT-3)",
    536       "relevance": "Seminal few-shot prompting paper defining the ICL paradigm under investigation"
    537     }
    538   ],
    539   "engagement_factors": {
    540     "practical_relevance": {
    541       "score": 1,
    542       "justification": "Challenges the intuition that semantically relevant prompts improve embeddings, but uses small/old models so implications for modern instruction-tuned LLM practitioners are unclear."
    543     },
    544     "surprise_contrarian": {
    545       "score": 2,
    546       "justification": "Core finding directly contradicts the widespread assumption that task-relevant prompts produce better internal representations — a clean null result that challenges practitioner intuitions about prompt design."
    547     },
    548     "fear_safety": {
    549       "score": 0,
    550       "justification": "No AI risk or safety concerns raised."
    551     },
    552     "drama_conflict": {
    553       "score": 0,
    554       "justification": "No controversy or conflict angle; a standard academic empirical study with a null result."
    555     },
    556     "demo_ability": {
    557       "score": 1,
    558       "justification": "In principle reproducible with standard HuggingFace models and public datasets, but no code is released and hyperparameters are underspecified."
    559     },
    560     "brand_recognition": {
    561       "score": 0,
    562       "justification": "Authors are from UPC and Bocconi University — no famous lab or industry affiliation."
    563     }
    564   },
    565   "hn_data": {
    566     "threads": [
    567       {
    568         "hn_id": "42898914",
    569         "title": "Gradual Disempowerment: How Even Incremental AI Progress Poses Existential Risks",
    570         "points": 87,
    571         "comments": 84,
    572         "url": "https://news.ycombinator.com/item?id=42898914",
    573         "created_at": "2025-02-01T15:12:22Z"
    574       },
    575       {
    576         "hn_id": "38036218",
    577         "title": "Zephyr 7B",
    578         "points": 4,
    579         "comments": 0,
    580         "url": "https://news.ycombinator.com/item?id=38036218",
    581         "created_at": "2023-10-27T09:06:34Z"
    582       },
    583       {
    584         "hn_id": "25604385",
    585         "title": "Learning from Heterogeneous EEG Signals with Differentiable Channel Reordering",
    586         "points": 2,
    587         "comments": 0,
    588         "url": "https://news.ycombinator.com/item?id=25604385",
    589         "created_at": "2021-01-01T16:33:05Z"
    590       },
    591       {
    592         "hn_id": "42915646",
    593         "title": "Stack Overflow Meets Replication: Security Research Amid Evolving Code Snippets",
    594         "points": 1,
    595         "comments": 0,
    596         "url": "https://news.ycombinator.com/item?id=42915646",
    597         "created_at": "2025-02-03T06:49:46Z"
    598       }
    599     ],
    600     "top_points": 87,
    601     "total_points": 94,
    602     "total_comments": 84
    603   }
    604 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs