ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (22000B)


      1 {
      2   "paper": {
      3     "title": "Concept Influence: Leveraging Interpretability to Improve Performance and Efficiency in Training Data Attribution",
      4     "authors": ["Matthew Kowal", "Gonçalo Paulo", "Louis Jaburi", "Tom Tseng", "Lev E McKinney", "Stefan Heimersheim", "Aaron David Tucker", "Adam Gleave", "Kellin Pelrine"],
      5     "year": 2026,
      6     "arxiv_id": "2602.14869"
      7   },
      8   "checklist": {
      9     "artifacts": {
     10       "code_released": {
     11         "applies": true,
     12         "answer": false,
     13         "justification": "No repository URL or code archive is provided in the paper. The paper references the Persona Vectors repository (Chen et al., 2025) and curvlinops library but does not release its own code."
     14       },
     15       "data_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The paper uses publicly available datasets: emergent misalignment datasets from the Persona Vectors repository (Chen et al., 2025), OpenAssistant v1 (OASST1; Köpf et al., 2023), and publicly available SAEs from Neuronpedia."
     19       },
     20       "environment_specified": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. The paper mentions curvlinops library and specific models but does not provide reproducible environment specifications."
     24       },
     25       "reproduction_instructions": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. Implementation details in Appendix B describe methodology but not how to reproduce the experiments."
     29       }
     30     },
     31     "statistical_methodology": {
     32       "confidence_intervals_or_error_bars": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Figure 5 shows error bars (shaded regions) for the OASST1 post-training experiment. The text mentions results 'within error bars' (Section 4.2)."
     36       },
     37       "significance_tests": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No statistical significance tests are reported. Claims like 'Concept Influence outperform classical influence functions across all domains' are based on visual comparison of bar charts without any tests."
     41       },
     42       "effect_sizes_reported": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper reports concrete effect sizes with context: e.g., '20× faster' speedup (Table 1), evil scores going from ~0.5 to ~2.3, MTBench scores from 38% to 67%, and specific timing comparisons (57s vs 1170s)."
     46       },
     47       "sample_size_justified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No justification is given for the choice of sample sizes (e.g., why 1,000 examples for efficiency comparison, why top-5 most influential points, why specific dataset sizes)."
     51       },
     52       "variance_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The emergent misalignment experiments (Figures 2, 11-14) show single-run bar charts without error bars or variance across runs. Only Figure 5 shows uncertainty bands. No mention of multiple runs or seeds for the main EM experiments."
     56       }
     57     },
     58     "evaluation_design": {
     59       "baselines_included": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "The paper compares against classical influence functions (Grosse et al., 2023), Projection Difference (Chen et al., 2025), and Vector Filter across all experiments."
     63       },
     64       "baselines_contemporary": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Baselines include recent methods: Persona Vectors/Projection Difference (Chen et al., 2025), influence functions scaled to LLMs (Grosse et al., 2023), and the IF-GUIDE approach (Coalson et al., 2025)."
     68       },
     69       "ablation_study": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper systematically compares variants (Vector Filter, Projection Difference, Concept Influence, Influence Functions) which are theoretically connected as successive approximations. Layer subsampling analysis in Appendix B/Figure 6 also serves as an ablation."
     73       },
     74       "multiple_metrics": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper uses multiple metrics: evil trait score, sycophancy score, MTBench instruction-following score, precision/recall/AUC for misalignment detection, correlation between methods, and wall-clock time."
     78       },
     79       "human_evaluation": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No human evaluation is included. All evaluations use LLM judges (OpenAI API-based) and automated benchmarks. Given that the paper makes claims about semantic quality of attributions (e.g., Figure 4 qualitative comparison), human evaluation would be relevant."
     83       },
     84       "held_out_test_set": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The emergent misalignment evaluation uses test examples outside the finetuning domain (the core premise of emergent misalignment). The OASST1 experiment uses MTBench as an independent test set."
     88       },
     89       "per_category_breakdown": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Results are broken down per dataset (Misaligned Opinions, Bad Medical Advice, Insecure Code, GSM8k Mistakes), per model (Qwen2.5-7B, Llama3.1-8b), and per trait (evil, sycophancy)."
     93       },
     94       "failure_cases_discussed": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The paper discusses failure cases: 'projection-difference scores become less reliable than influence-based methods' under distribution shift (Section 4.1.1), and Vector Filter 'sometimes fails entirely to identify harmful examples' for out-of-domain datasets like GSM8K."
     98       },
     99       "negative_results_reported": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper reports that Concept Influence 'marginally underperforms standard influence functions at very small data fractions' in the OASST1 experiment (Section 4.2), and that vector-based methods fail in out-of-distribution settings."
    103       }
    104     },
    105     "claims_and_evidence": {
    106       "abstract_claims_supported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Abstract claims about comparable performance with improved scalability are supported by Figures 2, 5 and Table 1. The 'order-of-magnitude faster' claim is supported by the 20x speedup in Table 1."
    110       },
    111       "causal_claims_justified": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper makes causal claims about data filtering effects and validates them through retrain-and-evaluate experiments (remove data, retrain, measure outcome change). This is a controlled interventional design adequate for the causal claims made."
    115       },
    116       "generalization_bounded": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The title and abstract make broad claims about 'Training Data Attribution', but the experiments are limited to two 7-8B models (Qwen2.5-7B and Llama3.1-8b) on specific EM datasets and one post-training dataset. The Discussion acknowledges 'scaling these experiments across model sizes' as future work, but the title and abstract do not bound the claims accordingly."
    120       },
    121       "alternative_explanations_discussed": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Section 4.1.2 discusses that vector-based and gradient-based methods 'are measuring fundamentally different notions of influence' and explains why each family captures different information. The discussion section explicitly addresses when each method is expected to succeed or fail."
    125       }
    126     },
    127     "setup_transparency": {
    128       "model_versions_specified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Specific model versions are stated: Qwen2.5-7B, Llama3.1-8b, Qwen2-7B-Base, Gemma-3-9B Instruct. These include size specifications. However, exact snapshot dates for the OpenAI LLM judge API are not specified."
    132       },
    133       "prompts_provided": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper uses an OpenAI API-based LLM judge for evaluation but does not provide the actual judge prompts. It references 'the weighted log-probs trick (Betley et al., 2025)' and 'the implementation from Chen et al. [2025]' without reproducing the prompts used."
    137       },
    138       "hyperparameters_reported": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Appendix B reports key hyperparameters: damping λ=10^-3, batch size 1, max sequence length 1536 tokens, 5000 examples for Kronecker factors, LoRA finetuning for a single epoch, SAE firing threshold 0.1, SAE layer 20."
    142       },
    143       "scaffolding_described": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No agentic scaffolding is used. This is a standard ML pipeline paper."
    147       },
    148       "data_preprocessing_documented": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "The paper describes dataset construction: 50% benign/50% misaligned splits, English subset of OASST1, and explains how test queries are sampled on-policy from finetuned models and ranked by trait score (Appendix B)."
    152       }
    153     },
    154     "limitations_and_scope": {
    155       "limitations_section_present": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "There is no dedicated limitations or threats-to-validity section. The 'Future work' paragraph in Section 5 mentions extensions but does not discuss limitations of the current work."
    159       },
    160       "threats_to_validity_specific": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "No specific threats to validity are discussed. The future work section mentions scaling to other model sizes and multimodal settings but frames these as opportunities rather than limitations."
    164       },
    165       "scope_boundaries_stated": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound its claims to the tested models, datasets, or settings despite testing only 7-8B parameter models on specific synthetic and real-world datasets."
    169       }
    170     },
    171     "data_integrity": {
    172       "raw_data_available": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "Raw influence scores, model outputs, and intermediate results are not made available. Only aggregated results in figures are shown."
    176       },
    177       "data_collection_described": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Data sources are clearly described: synthetic EM datasets from Chen et al. (2025) with 50/50 benign/misaligned splits, OASST1 corpus details from Köpf et al. (2023), and dataset examples in Appendix C."
    181       },
    182       "recruitment_methods_described": {
    183         "applies": false,
    184         "answer": false,
    185         "justification": "No human participants. All data comes from existing public datasets and model-generated outputs."
    186       },
    187       "data_pipeline_documented": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The pipeline is documented: finetune model → compute influence scores → rank examples → filter top/bottom K → retrain → evaluate with LLM judge. Appendix B provides implementation details."
    191       }
    192     },
    193     "conflicts_of_interest": {
    194       "funding_disclosed": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No funding or acknowledgments section is present in the paper. Authors are affiliated with FAR.AI, EleutherAI, and University of Toronto but no funding sources are disclosed."
    198       },
    199       "affiliations_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Author affiliations are clearly listed: FAR.AI, EleutherAI, University of Toronto."
    203       },
    204       "funder_independent_of_outcome": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding information is disclosed, so independence cannot be assessed. FAR.AI is an AI safety research organization which has a stake in the safety-relevant outcomes of this work."
    208       },
    209       "financial_interests_declared": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No competing interests or financial interests statement is present in the paper."
    213       }
    214     },
    215     "contamination": {
    216       "training_cutoff_stated": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "The paper evaluates pre-trained models (Qwen2.5-7B, Llama3.1-8b) on benchmarks but does not state training data cutoff dates. This is relevant because the EM datasets and OASST1 could overlap with the models' training data."
    220       },
    221       "train_test_overlap_discussed": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No discussion of whether the evaluation benchmarks or EM datasets could have been in the pre-training data of the models used."
    225       },
    226       "benchmark_contamination_addressed": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "OASST1 was published in 2023 and both Qwen2.5 and Llama3.1 were trained after that. The paper does not discuss potential contamination of this benchmark in the pre-training data."
    230       }
    231     },
    232     "human_studies": {
    233       "pre_registered": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No human participants in this study."
    237       },
    238       "irb_or_ethics_approval": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants in this study."
    242       },
    243       "demographics_reported": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "inclusion_exclusion_criteria": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "randomization_described": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "blinding_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "attrition_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       }
    268     },
    269     "cost_and_practicality": {
    270       "inference_cost_reported": {
    271         "applies": true,
    272         "answer": true,
    273         "justification": "Table 1 reports wall-clock time for each method (57s for Vector Filter, 142s for Projection Difference, 1170s for Concept Influence, 1161s for Influence Functions) on 1000 examples, with speedup factors."
    274       },
    275       "compute_budget_stated": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "No total computational budget (GPU hours, hardware used, total API spend for LLM judge) is stated. The paper does not specify what GPU was used for the experiments."
    279       }
    280     }
    281   },
    282   "claims": [
    283     {
    284       "claim": "Concept Influence outperforms classical influence functions across all emergent misalignment domains and traits at filtering misaligned data.",
    285       "evidence": "Figure 2 shows filtering results for Qwen2.5-7B across four EM datasets. Section 4.1.1 states this holds across additional models (Llama3.1-8b) and traits (sycophancy) in Appendix D.",
    286       "supported": "moderate"
    287     },
    288     {
    289       "claim": "Probe-based attribution methods (Vector Filter, Projection Difference) are first-order approximations of Concept Influence and achieve speedups ranging from roughly 8x (Projection Difference) to over 20x (Vector Filter).",
    290       "evidence": "Section 3.3-3.4 provides theoretical derivation. Table 1 shows 20.4x speedup for Vector Filter and 8.2x for Projection Difference on 1000 examples.",
    291       "supported": "strong"
    292     },
    293     {
    294       "claim": "Vector Filter with only 5% of the OASST1 dataset achieves the best safety-capability tradeoff, matching full dataset instruction-following performance (MTBench 67) with evil score of 0.8 vs 2.3.",
    295       "evidence": "Figure 5 and Section 4.2 present these results for Qwen2-7B-Base fine-tuned on OASST1.",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "Vector-based methods fail under distribution shift while gradient-based methods remain robust.",
    300       "evidence": "Section 4.1.1: 'for out-of-domain datasets such as GSM8K, simple vector filtering sometimes fails entirely.' Figure 3 shows correlation structure between method families.",
    301       "supported": "moderate"
    302     }
    303   ],
    304   "methodology_tags": ["benchmark-eval"],
    305   "key_findings": "The paper introduces Concept Influence, which attributes model behavior to semantic directions (probes, SAE features) rather than individual test examples, improving training data attribution for safety-relevant behaviors. Probe-based methods are shown to be first-order approximations of Concept Influence, achieving 20x speedup with competitive performance. On synthetic emergent misalignment benchmarks with Qwen2.5-7B and Llama3.1-8b, Concept Influence outperforms classical influence functions at identifying misaligned data. On real-world post-training with OASST1, Vector Filter with 5% of data matches full-dataset instruction-following while substantially reducing harmful outputs.",
    306   "red_flags": [
    307     {
    308       "flag": "No variance/error bars on main experiments",
    309       "detail": "The emergent misalignment filtering experiments (Figure 2, the core results) show single-run bar charts without error bars or multiple seeds. Only the OASST1 experiment (Figure 5) includes uncertainty bands. Given that LoRA finetuning and LLM judge scoring can be stochastic, the absence of variance reporting on the main results is a concern."
    310     },
    311     {
    312       "flag": "No limitations section",
    313       "detail": "The paper lacks any dedicated limitations or threats-to-validity discussion. The future work paragraph frames limitations as opportunities rather than acknowledging constraints of the current evaluation."
    314     },
    315     {
    316       "flag": "LLM judge as sole evaluator",
    317       "detail": "All evaluations of misalignment traits (evil, sycophancy) rely entirely on an OpenAI API-based LLM judge. The judge prompts are not provided, and no human evaluation validates that the judge scores are meaningful. The reliability of LLM judges for measuring abstract traits like 'evilness' is not established."
    318     }
    319   ],
    320   "cited_papers": [
    321     {
    322       "title": "Emergent misalignment: Narrow finetuning can produce broadly misaligned llms",
    323       "authors": ["Jan Betley", "Daniel Tan", "Niels Warncke", "Anna Sztyber-Betley", "Xuchan Bao", "Martín Soto", "Nathan Labenz", "Owain Evans"],
    324       "year": 2025,
    325       "arxiv_id": "2502.17424",
    326       "relevance": "Core benchmark for evaluating training data attribution methods on misalignment detection in LLMs."
    327     },
    328     {
    329       "title": "Studying large language model generalization with influence functions",
    330       "authors": ["Roger Grosse", "Juhan Bae", "Cem Anil", "Nelson Elhage"],
    331       "year": 2023,
    332       "arxiv_id": "2308.03296",
    333       "relevance": "Key baseline: scaling influence functions to LLMs, providing the EK-FAC approach this paper builds upon."
    334     },
    335     {
    336       "title": "Sleeper agents: Training deceptive llms that persist through safety training",
    337       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    338       "year": 2024,
    339       "arxiv_id": "2401.05566",
    340       "relevance": "Related work on persistent deceptive behaviors in LLMs that motivates training data attribution for safety."
    341     },
    342     {
    343       "title": "Persona vectors: Monitoring and controlling character traits in language models",
    344       "authors": ["Runjin Chen", "Andy Arditi", "Henry Sleight", "Owain Evans", "Jack Lindsey"],
    345       "year": 2025,
    346       "arxiv_id": "2507.21509",
    347       "relevance": "Direct baseline providing Projection Difference method and evaluation infrastructure used in this paper."
    348     },
    349     {
    350       "title": "If-guide: Influence function-guided detoxification of llms",
    351       "authors": ["Zachary Coalson", "Juhan Bae", "Nicholas Carlini", "Sanghyun Hong"],
    352       "year": 2025,
    353       "relevance": "Related work applying influence functions specifically to LLM detoxification."
    354     },
    355     {
    356       "title": "Model organisms for emergent misalignment",
    357       "authors": ["Edward Turner", "Anna Soligo", "Mia Taylor", "Senthooran Rajamanoharan", "Neel Nanda"],
    358       "year": 2025,
    359       "relevance": "Extends emergent misalignment observations across different datasets and models."
    360     },
    361     {
    362       "title": "Convergent linear representations of emergent misalignment",
    363       "authors": ["Anna Soligo", "Edward Turner", "Senthooran Rajamanoharan", "Neel Nanda"],
    364       "year": 2025,
    365       "relevance": "Discovers directions in activation space representing misaligned behavior, directly related to concept vectors used in this work."
    366     },
    367     {
    368       "title": "Persona features control emergent misalignment",
    369       "authors": ["Miles Wang", "Tom Dupré la Tour", "Olivia Watkins"],
    370       "year": 2025,
    371       "relevance": "Demonstrates persona features controlling emergent misalignment, closely related to the concept-based attribution approach."
    372     },
    373     {
    374       "title": "Do influence functions work on large language models?",
    375       "authors": ["Zhe Li", "Wei Zhao", "Yige Li", "Jun Sun"],
    376       "year": 2024,
    377       "arxiv_id": "2409.19998",
    378       "relevance": "Evaluates reliability of influence functions for LLMs, finding large influence scores can correspond to negligible semantic changes."
    379     }
    380   ]
    381 }

Impressum · Datenschutz