ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (20833B)


      1 {
      2   "paper": {
      3     "title": "Position: Explaining Behavioral Shifts in Large Language Models Requires a Comparative Approach",
      4     "authors": ["Martino Ciaperoni", "Marzio Di Vece", "Luca Pappalardo", "Fosca Giannotti", "Francesco Giannini"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.02304"
      8   },
      9   "scan_version": 2,
     10   "active_modules": [],
     11   "methodology_tags": ["theoretical", "case-study"],
     12   "key_findings": "The paper argues that behavioral shifts in LLMs (from scaling, fine-tuning, RLHF, etc.) require comparative explainability methods (∆-XAI) rather than single-checkpoint XAI. It proposes a framework with 10 desiderata organized into comparability, validity, actionability, and monitoring. A small illustrative experiment on Qwen2.5-0.5B demonstrates that fine-tuning-induced medical misalignment can be localized to late transformer layers via CKA, activation patching, and activation steering.",
     13   "claims": [
     14     {
     15       "claim": "Classical XAI methods are structurally ill-suited to explain behavioral shifts across model checkpoints because they operate on single models in isolation.",
     16       "evidence": "Argued throughout Section 1 and Section 6 (Alternative Views). The rebuttal notes XAI answers 'why this prediction' not 'what changed after intervention.'",
     17       "supported": "moderate"
     18     },
     19     {
     20       "claim": "Fine-tuning Qwen2.5-0.5B on a narrow dataset induces misaligned medical advice, and this shift is localized to the third-to-last transformer layer.",
     21       "evidence": "Section 5.2: CKA similarity shows divergence in final 3 layers (Figure 6a), activation patching at third-to-last layer increases cosine similarity to pre-fine-tuning outputs (Figure 6b), and activation steering at that layer shifts unsafe outputs toward safer advice (Figure 6c).",
     22       "supported": "moderate"
     23     },
     24     {
     25       "claim": "Activation steering using a probe-derived direction at the third-to-last layer can shift misaligned medical advice toward safer outputs without modifying model parameters.",
     26       "evidence": "Section 5.2 and Figure 6c show a qualitative example with α=15 steering. Linear probe achieves >0.95 test accuracy distinguishing pre/post representations.",
     27       "supported": "weak"
     28     }
     29   ],
     30   "checklist": {
     31     "artifacts": {
     32       "code_released": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The paper provides an anonymous repository URL: https://anonymous.4open.science/r/ComparativeXAI4LLMs-7CCB for reproducing the experiment (Section 5.2)."
     36       },
     37       "data_released": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper mentions 15 prompts for the experiment and 400 expanded prompts generated by GPT-5.2, but does not provide these datasets separately. They may be in the anonymous repo but this is not explicitly stated."
     41       },
     42       "environment_specified": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No environment specifications, requirements files, or dependency versions are mentioned in the paper."
     46       },
     47       "reproduction_instructions": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper links to a repository but does not include step-by-step reproduction instructions in the paper itself. The experimental description in Section 5.2 is a narrative, not a reproducible protocol."
     51       }
     52     },
     53     "statistical_methodology": {
     54       "confidence_intervals_or_error_bars": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "Figure 6b shows box plots but no confidence intervals or error bars on the main results. The CKA and cosine similarity results are reported as point estimates or aggregates without uncertainty quantification."
     58       },
     59       "significance_tests": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No statistical significance tests are reported. The activation patching results (Figure 6b) compare distributions visually via box plots but no formal test is applied."
     63       },
     64       "effect_sizes_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No formal effect sizes are reported. The paper shows qualitative and visual comparisons (CKA curves, box plots, steering examples) without quantifying effect magnitudes."
     68       },
     69       "sample_size_justified": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "The experiment uses only 15 prompts with no justification for this sample size. The 400 expanded prompts for probe training are also unjustified."
     73       },
     74       "variance_reported": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No variance across runs is reported. Results appear to be from single runs. The box plots in Figure 6b show spread across prompts but not across experimental repetitions."
     78       }
     79     },
     80     "evaluation_design": {
     81       "baselines_included": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "This is a position paper proposing a framework. The illustrative experiment demonstrates the framework but does not claim superiority over alternatives, so baselines are not structurally required."
     85       },
     86       "baselines_contemporary": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "No baselines needed for a position paper with illustrative experiments."
     90       },
     91       "ablation_study": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "The paper proposes a framework, not a system with separable components. The illustrative experiment demonstrates different analysis techniques but is not presenting a system to ablate."
     95       },
     96       "multiple_metrics": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The experiment uses multiple metrics: CKA similarity (Figure 6a), cosine similarity of sentence embeddings (Figure 6b), and probe classification accuracy (>0.95). Multiple analysis perspectives (feature attribution, representation similarity, activation patching, activation steering) are applied."
    100       },
    101       "human_evaluation": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "No human evaluation of the steering or patching outputs is conducted. The medical advice quality is assessed only via automated metrics (cosine similarity) and qualitative cherry-picked examples."
    105       },
    106       "held_out_test_set": {
    107         "applies": false,
    108         "answer": false,
    109         "justification": "The illustrative experiment is not a benchmark evaluation requiring held-out test sets. It demonstrates analysis techniques on a small prompt set."
    110       },
    111       "per_category_breakdown": {
    112         "applies": false,
    113         "answer": false,
    114         "justification": "Not applicable to a position paper with a small illustrative experiment."
    115       },
    116       "failure_cases_discussed": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "No failure cases of the ∆-XAI approach are shown. The paper only presents successful demonstrations of localization and steering."
    120       },
    121       "negative_results_reported": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "No negative results are reported. Every analysis stage (attribution, CKA, patching, steering) shows the expected positive outcome."
    125       }
    126     },
    127     "claims_and_evidence": {
    128       "abstract_claims_supported": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The abstract claims the paper formulates a ∆-XAI framework with desiderata, introduces possible pipelines, and provides a concrete experiment. All three are present in the paper (Sections 3-5)."
    132       },
    133       "causal_claims_justified": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper makes causal claims about activation patching and steering ('reveals the internal representational changes responsible for this misalignment'), but the experiment uses only 15 prompts with no controls for prompt selection bias, no statistical tests, and no bidirectional validation (mentioned as possible but not performed)."
    137       },
    138       "generalization_bounded": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The framework is presented as general for all LLM behavioral shifts, but the sole experiment uses a 0.5B parameter model (Qwen2.5-0.5B) with 15 prompts on one specific behavior (medical advice). The paper does not explicitly bound its claims to this narrow setting."
    142       },
    143       "alternative_explanations_discussed": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Section 6 (Alternative Views) explicitly addresses two counter-positions: that single-checkpoint XAI methods suffice, and that evaluation without explanation is enough. Each receives a substantive rebuttal."
    147       },
    148       "proxy_outcome_distinction": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "The paper uses cosine similarity of sentence embeddings as a proxy for 'safer behavior' and CKA for 'representational change' without discussing the gap between these proxies and the actual constructs of safety and mechanistic understanding."
    152       }
    153     },
    154     "setup_transparency": {
    155       "model_versions_specified": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "The paper specifies 'instruction-tuned Qwen2.5 model with 0.5 billion parameters' and 'all-MiniLM-L6-v2 sentence-transformer' (Section 5.2). The Qwen version is named with size."
    159       },
    160       "prompts_provided": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "The paper mentions using 15 prompts and 400 expanded prompts but does not provide the actual prompt texts in the paper. Only one example prompt appears in Figure 6c. The full set may be in the anonymous repo but is not in the paper."
    164       },
    165       "hyperparameters_reported": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "Steering parameter α=15 is stated, but fine-tuning hyperparameters, decoding parameters (temperature, top-p), and Integrated Gradients settings are not reported."
    169       },
    170       "scaffolding_described": {
    171         "applies": false,
    172         "answer": false,
    173         "justification": "No agentic scaffolding is used. The experiment applies standard ML analysis techniques."
    174       },
    175       "data_preprocessing_documented": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not describe how the 15 prompts were selected or how the 400 GPT-5.2-expanded prompts were generated (what prompt was used to generate them, what filtering was applied)."
    179       }
    180     },
    181     "limitations_and_scope": {
    182       "limitations_section_present": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "There is no dedicated limitations section. The closest is Section 6 (Alternative Views) which addresses counter-arguments but does not discuss limitations of the framework or experiment."
    186       },
    187       "threats_to_validity_specific": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No threats to validity are discussed. The tiny sample size (15 prompts), single model, and single behavior type are not acknowledged as limitations."
    191       },
    192       "scope_boundaries_stated": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "The paper does not explicitly state what the results do NOT show. The framework is presented broadly without bounding its applicability to specific model sizes, architectures, or shift types."
    196       }
    197     },
    198     "data_integrity": {
    199       "raw_data_available": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "Raw experimental data (prompts, model outputs, attribution scores, CKA values) is not provided in the paper. An anonymous repository is linked but its contents are not described."
    203       },
    204       "data_collection_described": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "The origin of the 15 medical prompts is not described. The paper says '15 prompts that request medical advice' without explaining how they were selected or constructed."
    208       },
    209       "recruitment_methods_described": {
    210         "applies": false,
    211         "answer": false,
    212         "justification": "No human participants. The study uses model-generated outputs only."
    213       },
    214       "data_pipeline_documented": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "The pipeline from prompt selection to final results is not documented with sufficient detail. The expansion from 15 to 400 prompts via GPT-5.2 is mentioned but the generation process is not described."
    218       }
    219     },
    220     "conflicts_of_interest": {
    221       "funding_disclosed": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No funding information is provided in the paper."
    225       },
    226       "affiliations_disclosed": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "Author affiliations are clearly listed: Scuola Normale Superiore, ISTI-CNR, and University of Pisa."
    230       },
    231       "funder_independent_of_outcome": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No funding disclosed, so independence cannot be assessed."
    235       },
    236       "financial_interests_declared": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No competing interests or financial interests statement is present in the paper."
    240       }
    241     },
    242     "contamination": {
    243       "training_cutoff_stated": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. The experiment fine-tunes a model and analyzes its internal representations; it is not testing benchmark performance."
    247       },
    248       "train_test_overlap_discussed": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No benchmark evaluation is performed. The experiment analyzes behavioral shifts, not benchmark scores."
    252       },
    253       "benchmark_contamination_addressed": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No benchmark evaluation is performed."
    257       }
    258     },
    259     "human_studies": {
    260       "pre_registered": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "irb_or_ethics_approval": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "demographics_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "inclusion_exclusion_criteria": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "randomization_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       },
    285       "blinding_described": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human participants in this study."
    289       },
    290       "attrition_reported": {
    291         "applies": false,
    292         "answer": false,
    293         "justification": "No human participants in this study."
    294       }
    295     },
    296     "cost_and_practicality": {
    297       "inference_cost_reported": {
    298         "applies": false,
    299         "answer": false,
    300         "justification": "Position/theoretical paper. The illustrative experiment is not proposing a method whose cost matters for deployment."
    301       },
    302       "compute_budget_stated": {
    303         "applies": false,
    304         "answer": false,
    305         "justification": "Position/theoretical paper with a small illustrative experiment. Compute budget is not a meaningful concern."
    306       }
    307     }
    308   },
    309   "red_flags": [
    310     {
    311       "flag": "Tiny sample size for empirical claims",
    312       "detail": "The concrete experiment uses only 15 prompts to demonstrate CKA, activation patching, and attribution analysis. This is far too small to draw reliable conclusions about where behavioral shifts are localized, yet the paper presents the results as evidence for the framework's utility."
    313     },
    314     {
    315       "flag": "Cherry-picked qualitative examples",
    316       "detail": "The activation steering result (Figure 6c) shows a single cherry-picked prompt-response pair. No systematic evaluation of steering quality across prompts is provided."
    317     },
    318     {
    319       "flag": "Missing bidirectional validation",
    320       "detail": "The paper mentions that bidirectional patching (Mpost→Mpre) 'could' be done as further validation but does not actually perform it, despite presenting the pipeline as providing causal evidence (D6)."
    321     },
    322     {
    323       "flag": "No limitations section",
    324       "detail": "For a position paper proposing a new framework, there is no discussion of the framework's own limitations, the conditions under which it might fail, or the scalability challenges of applying ∆-XAI to large models."
    325     }
    326   ],
    327   "cited_papers": [
    328     {
    329       "title": "Emergent misalignment: Narrow finetuning can produce broadly misaligned LLMs",
    330       "authors": ["J. Betley", "D. C. H. Tan", "N. Warncke", "A. Sztyber-Betley", "X. Bao", "M. Soto", "N. Labenz", "O. Evans"],
    331       "year": 2025,
    332       "relevance": "Demonstrates that narrow fine-tuning can induce broad safety failures across unrelated domains, a key motivation for ∆-XAI."
    333     },
    334     {
    335       "title": "Model organisms for emergent misalignment",
    336       "authors": ["E. Turner", "A. Soligo", "M. Taylor", "S. Rajamanoharan", "N. Nanda"],
    337       "year": 2025,
    338       "arxiv_id": "2506.11613",
    339       "relevance": "Provides the controlled setting used in the paper's experiment; shows emergent misalignment across model sizes and architectures."
    340     },
    341     {
    342       "title": "Natural emergent misalignment from reward hacking in production RL",
    343       "authors": ["M. MacDiarmid", "B. Wright", "J. Uesato", "E. Hubinger"],
    344       "year": 2025,
    345       "arxiv_id": "2511.18397",
    346       "relevance": "Demonstrates that Claude can engage in reward hacking that generalizes into lying and sabotage, motivating mechanistic explanation of behavioral shifts."
    347     },
    348     {
    349       "title": "Emergent abilities of large language models",
    350       "authors": ["J. Wei", "Y. Tay", "R. Bommasani", "C. Raffel"],
    351       "year": 2022,
    352       "relevance": "Seminal paper on emergent abilities in LLMs, a core phenomenon that ∆-XAI aims to explain."
    353     },
    354     {
    355       "title": "Are emergent abilities of large language models a mirage?",
    356       "authors": ["R. Schaeffer", "B. Miranda", "S. Koyejo"],
    357       "year": 2023,
    358       "relevance": "Counter-argument that emergent abilities may be artifacts of evaluation metrics rather than intrinsic model properties."
    359     },
    360     {
    361       "title": "Convergent linear representations of emergent misalignment",
    362       "authors": ["A. Soligo", "E. Turner", "S. Rajamanoharan", "N. Nanda"],
    363       "year": 2025,
    364       "arxiv_id": "2506.11618",
    365       "relevance": "Extracts directions from activation differences associated with safety-relevant shifts, directly related to ∆-XAI mechanistic interventions."
    366     },
    367     {
    368       "title": "HarmBench: A standardized evaluation framework for automated red teaming and robust refusal",
    369       "authors": ["M. Mazeika", "L. Phan", "X. Yin", "A. Zou"],
    370       "year": 2024,
    371       "relevance": "Standardized safety evaluation framework for LLMs, representing the behavioral-testing approach that ∆-XAI aims to complement."
    372     },
    373     {
    374       "title": "Towards automated circuit discovery for mechanistic interpretability",
    375       "authors": ["A. Conmy", "A. Mavor-Parker", "A. Lynch", "S. Heimersheim", "A. Garriga-Alonso"],
    376       "year": 2023,
    377       "relevance": "Activation patching for mechanistic interpretability, a key technique family in the ∆-XAI framework."
    378     },
    379     {
    380       "title": "CoT is not explainability",
    381       "authors": ["F. Barez", "J. A. Hartmann", "D. Krueger"],
    382       "year": 2025,
    383       "relevance": "Argues that chain-of-thought is not faithful explanation, supporting the need for mechanistic ∆-XAI methods over self-explanations."
    384     },
    385     {
    386       "title": "Predicting emergent capabilities by finetuning",
    387       "authors": ["C. Snell", "E. Wallace", "D. Klein", "S. Levine"],
    388       "year": 2024,
    389       "arxiv_id": "2411.16035",
    390       "relevance": "Shows that task-specific fine-tuning can reduce the scale at which emergence occurs, relevant to understanding behavioral shifts."
    391     }
    392   ]
    393 }

Impressum · Datenschutz