ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (34140B)


      1 {
      2   "paper": {
      3     "title": "Fine-grained Analysis of Brain-LLM Alignment through Input Attribution",
      4     "authors": [
      5       "Michela Proietti",
      6       "Roberto Capobianco",
      7       "Mariya Toneva"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv",
     11     "arxiv_id": "2510.12355",
     12     "doi": "10.48550/arXiv.2510.12355"
     13   },
     14   "scan_version": 3,
     15   "active_modules": [
     16     "experimental_rigor",
     17     "data_leakage"
     18   ],
     19   "methodology_tags": [
     20     "benchmark-eval",
     21     "observational"
     22   ],
     23   "key_findings": "Brain alignment (BA) and next-word prediction (NWP) rely on largely distinct subsets of input words, with IoU as low as 0.1–0.2 at stringent attribution thresholds. NWP exhibits strong recency and primacy biases driven primarily by syntactic features, while BA shows a broader recency pattern emphasizing semantic and discourse-level information. Attribution spread for BA increases with model depth (middle/late layers) while NWP spread decreases, suggesting BA relies on higher-order, semantically richer representations. These patterns are consistent across transformer, SSM, and hybrid architectures, and replicate on a second fMRI dataset.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The Reproducibility Statement provides a GitHub link: 'Open-source code, including data preprocessing scripts, attribution implementations, and evaluation procedures, available at https://github.com/michelaproietti/Brain-LLM-Alignment-Attribution.'"
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper uses publicly available fMRI datasets: the HP dataset (Wehbe et al., 2014a) and the MRH dataset (Deniz et al., 2019). All five LLMs are publicly available from HuggingFace."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper reports hardware (NVIDIA H100 GPU) and mentions the Captum library and HuggingFace Transformers, but does not provide a requirements.txt, Dockerfile, or detailed dependency version list sufficient to recreate the environment."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The Reproducibility Statement points to code and describes the methodology in detail, but the paper itself does not include step-by-step reproduction instructions (e.g., specific commands to run). The reader must infer the workflow from the code repository."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Error bars representing standard error are reported in multiple figures: 'Error bars represent the standard error across contexts' (Figure 3), 'standard error across subjects' (Figure 22a), and 'standard errors across models' (Figure 4)."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Statistical significance is assessed: 'Asterisks denote significant differences (p < 0.001), assessed via a two-sided paired t-test, with Benjamini-Hochberg correction' (Figure 3). Paired t-tests are also used in Figure 22b for pairwise model comparisons."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Effect sizes are reported with context: IoU values (e.g., '≈0.16' vs random baseline 'essentially zero'), AUC values for attribution spread, percentage drops in masking experiments ('more than doubles the loss' at 1%), and CoM values. These provide magnitude context for comparisons."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper uses 8 subjects (HP) and 9 subjects (MRH) without a power analysis or any statistical justification for these sample sizes. The only rationale offered is that these are widely used public datasets: 'one of the public fMRI dataset with the most data per participant.'"
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Standard errors are reported across subjects, contexts, and models throughout the figures (Figures 3, 4, 22). Per-subject results are shown in Appendix F (Figures 19, 20) demonstrating inter-subject consistency."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "A random baseline is included for IoU analysis: 'for each TR and threshold t, we drew 100 pairs of random word sets matching the sizes of the BA- and NWP-top-t% sets, and averaged their IoUs' (Section 4, Figure 2). The comparison between BA and NWP itself serves as a controlled comparison."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The five LLMs tested span contemporary architectures from 2023–2024: Falcon3-1B, Gemma-2B, Llama3.2-1B, Mamba-1.4B, and Zamba2-1.2B, covering transformers, SSMs, and hybrid models."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The masking experiments in Appendix C systematically remove top-attributed words to verify their functional importance, serving as an ablation of the attribution signal. The paper also tests shorter contexts (80 words vs 640, Appendix I) and alternative attribution methods (IG vs GXI, Appendix E)."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Multiple evaluation metrics are used: Intersection over Union (IoU) for overlap, Center of Mass (CoM) for positional patterns, attribution spread (AUC), linguistic feature distribution analysis, masking-based performance drops (CE loss for NWP, Pearson r for BA), and pairwise model significance tests."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No human evaluation of the attribution outputs is performed. All evaluation is automated using quantitative metrics (IoU, CoM, statistical tests). Human judgment of whether the identified important words are linguistically meaningful could have been informative."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Cross-validation with held-out test folds is used: '4-fold cross-validation for HP, and 11-fold cross-validation (with each fold corresponding to one story) for MRH, and select the regularization strength via nested cross-validation' (Section 3.3.1)."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Results are broken down by model (5 models), layer depth (early/middle/late), attribution threshold (1%–98%), linguistic feature type (semantic/syntactic/discourse), dataset (HP/MRH), and individual subjects (Appendix F)."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The anomalous oscillatory attribution pattern of Llama3.2-1B is investigated in depth (Section 4, Appendices H, I), including tests with Qwen2-1.5B (similar architecture, no oscillation), shorter contexts (oscillations disappear), and a second dataset (MRH, no oscillation). Limitations of gradient-based methods are also discussed."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper reports that Llama3.2-1B's oscillatory pattern does not generalize: it disappears with shorter contexts (Appendix I), on the MRH dataset (Appendix D.3), and is absent in architecturally similar Qwen2-1.5B (Appendix H). The paper also reports that BA attribution overlap between long and short contexts is low (IoU < 0.1 for t < 80%), indicating fragility of the attribution signal."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "All abstract claims are supported: (1) BA and NWP rely on 'largely distinct word subsets' is supported by IoU analysis (Figure 2); (2) 'NWP exhibits recency and primacy biases with a focus on syntax' is supported by Figures 4–5; (3) 'BA prioritizes semantic and discourse-level information' is supported by Figure 4. The claim about generalizability is supported by MRH replication (Appendix D)."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper's main causal claim—that top-attributed words are functionally important for each task—is supported by masking experiments (Appendix C) showing performance collapse when top-1% words are replaced. Other claims are framed as correlational/attributional (e.g., 'BA relies on,' 'NWP exhibits'), avoiding unjustified causal language."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The title 'Fine-grained Analysis of Brain-LLM Alignment through Input Attribution' and abstract claims about 'LLMs' are broader than the tested setting: only 1–2B parameter models, only two fMRI datasets (both English reading comprehension), and only frozen pretrained models were used. The paper acknowledges 'Future work should apply our framework to larger and more diverse models,' but the title and abstract do not restrict their claims to these settings."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Multiple alternative explanations are discussed: for positional biases, the paper considers both architectural factors (RoPE, attention) and training data biases (Section 4). For Llama3.2-1B's oscillatory pattern, they test architecture vs stimulus vs context-length explanations (Appendices H, I, D.3). The Limitations section discusses sensitivity of gradient-based methods."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper clearly defines brain alignment as 'the performance of brain encoding models that predict brain activity given LLM representations, typically measured as the Pearson correlation between true and predicted activity' (Section 1). Claims are framed in terms of this specific measure rather than broader cognition claims, and the paper explicitly notes that 'our findings reflect the models' inductive biases, rather than an optimal solution.'"
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "All five models are specified by name and size with citations to their specific releases: Falcon3-1B (Team, 2024), Gemma-2B (Mesnard et al., 2024), Llama3.2-1B (Meta AI, 2024), Mamba-1.4B (Gu & Dao, 2023), Zamba2-1.2B (Glorioso et al., 2024). These are open-source models with unique HuggingFace identifiers. Architectural details (hidden size, context length, layers) are in Table 1."
    153       },
    154       "prompts_provided": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "The paper does not use prompting. Models are used as frozen feature extractors: text is fed through and layer embeddings are extracted without any prompt engineering."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Key hyperparameters are reported: context length L=640 words, IG interpolation steps m=20, cross-validation folds (4 for HP, 11 for MRH), hemodynamic delay D=4 TRs, ridge regression with nested CV for regularization. Selected layers for attribution are listed in Table 2."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "No agentic scaffolding is used. The pipeline is a standard feature extraction → linear regression → gradient attribution framework."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Preprocessing is thoroughly documented: context construction (L=640, target word as final word), token-to-word embedding aggregation (averaging), word-to-TR downsampling (averaging per TR), hemodynamic delay concatenation (4 TRs), and the full pipeline is illustrated in Figure 6 and described in Section 3.3.1 and Appendix A.4."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "A dedicated 'Limitations' subsection appears in Section 5 (Discussion), discussing gradient-based method sensitivity, coarse discourse annotations, and the use of frozen models."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Specific threats are discussed: (1) 'gradient-based attribution methods may be sensitive to local nonlinearities and model smoothness,' mitigated by cross-architecture validation and multiple methods; (2) 'discourse feature annotations used in our analysis on the HP dataset are relatively coarse and limited to predefined categories'; (3) 'we perform attributions using frozen models, without optimizing them for BA.'"
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "The paper notes model scale and architecture limitations ('Future work should apply our framework to larger and more diverse models') and frozen-model constraints, but does not explicitly state what the results do NOT show. The scope boundaries are left implicit rather than explicitly enumerated."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Both fMRI datasets are publicly available: HP (Wehbe et al., 2014a) and MRH (Deniz et al., 2019). All models are on HuggingFace. Code is on GitHub. This enables independent verification of the full pipeline."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The HP dataset is described: '8 subjects reading chapter 9 of Harry Potter and the Sorcerer's Stone, N=5176 words, presented one-by-one for 0.5 seconds each, fMRI sampled at TR=2 seconds, divided into four runs.' The MRH dataset is described in Appendix A.1 with similar detail."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "The paper uses existing public fMRI datasets but does not describe how the original participants were recruited. Only the data characteristics (number of subjects, task) are reported, with references to the original data papers."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The full pipeline is documented from raw data to final analysis: text → tokenization → LLM embeddings → word-level aggregation → TR-level downsampling → hemodynamic delay → ridge regression → attribution scores → aggregation to word-level. Figure 6 and Section 3.3 detail each step."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding acknowledgment or grant information is present in the paper. Authors are from Sapienza University, Sony AI, and Max Planck Institute."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly listed: Michela Proietti (Sapienza University of Rome), Roberto Capobianco (Sony AI, Zurich), Mariya Toneva (Max Planck Institute for Software Systems). The corporate affiliation (Sony AI) is disclosed."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No funding source is disclosed, making it impossible to assess funder independence. One author is affiliated with Sony AI, a corporate research lab, but no explicit funding relationship is stated."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial interests statement is present in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "This paper uses frozen pre-trained LLMs as feature extractors to predict brain activity from fMRI data, not to evaluate model capability on a benchmark. Whether models saw the stimulus text (Harry Potter) during training does not constitute traditional benchmark contamination."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "The paper does not evaluate pre-trained model capability on a benchmark. LLM representations are used as inputs to a separately trained brain encoding model, so train/test overlap in the traditional contamination sense is not applicable."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "Not applicable. The paper evaluates brain alignment quality of LLM representations, not model performance on a knowledge or capability benchmark."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "This paper did not conduct a human subjects study. It reuses existing public fMRI datasets (Wehbe et al., 2014a; Deniz et al., 2019)."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No new human data was collected. The paper uses existing public fMRI datasets."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No new human participants were recruited. The paper references existing datasets without reporting subject demographics beyond count (8 for HP, 9 for MRH)."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No new participants were recruited. The paper uses existing public fMRI datasets."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human subjects study was conducted; this is a computational analysis using existing fMRI data."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human subjects study was conducted; no blinding is relevant."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No new human participants; the paper uses all subjects from the existing public datasets."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "Appendix J (Tables 3–5) reports detailed per-task compute times and peak GPU memory for each model on each dataset. For example, GXI attribution for Llama3.2-1B on HP took 1 day 14 hours with 2.53 GB peak memory."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Appendix J summarizes total compute: '≈2 hours for representation extraction, ≈219 hours for brain alignment training, ≈1501 hours for GXI attributions, ≈329 hours for IG attribution, ≈3.6 hours for NWP attribution.' All experiments on a single NVIDIA H100 GPU (80GB)."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No multiple random seed analysis is reported. The ridge regression and gradient-based attribution procedures are largely deterministic given the data splits, but no seed sensitivity analysis is performed or discussed."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The cross-validation procedure is clearly stated: 4-fold CV for HP, 11-fold CV for MRH, with nested CV for regularization selection. The number of subjects (8 HP, 9 MRH) and averaging procedures are specified."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Ridge regularization is selected via nested cross-validation, but the paper does not report the range of regularization values searched, the number of candidates tried, or the search method."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": true,
    321         "justification": "Layer selection is justified: 'we evaluate brain encoding performance at every layer... We divide each model into three equal-depth sections and select the layer within each section that achieves the highest average correlation across voxels and subjects' (Appendix B). Regularization is selected via nested CV."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Benjamini-Hochberg correction is applied: 'Asterisks denote significant differences (p < 0.001), assessed via a two-sided paired t-test, with Benjamini-Hochberg correction (Benjamini & Hochberg, 1995)' (Figure 3 caption)."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper introduces a novel attribution framework and evaluates it without discussing the bias of evaluating one's own method. No independent evaluation or explicit acknowledgment of self-comparison bias is provided."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "While compute costs are reported in Appendix J, performance is not analyzed as a function of compute budget. The varying compute costs across models (e.g., Mamba-1.4B GXI takes 11.5 days vs Llama3.2-1B at 1.6 days) are not related to brain alignment performance."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The paper does not question whether Pearson correlation between predicted and actual voxel activity is a valid measure of 'brain-LLM alignment.' The masking experiments validate the attribution method, but the higher-level construct (whether encoding model performance reflects meaningful language processing alignment) is not critically examined."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No scaffolding is involved. Models are used as frozen feature extractors with identical processing pipelines."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The Harry Potter text (published 1998) was almost certainly in the training data of all five LLMs (trained 2023–2024). The paper does not discuss whether models' familiarity with the text affects their representations and thus brain alignment scores."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "The paper does not explicitly discuss whether the evaluation setup (e.g., 640-word context windows) introduces any information leakage. The experimental design appears clean (target word is always the final word), but this is not discussed."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "All TRs and contexts come from the same continuous text (a single Harry Potter chapter), introducing temporal autocorrelation and non-independence between examples. The cross-validation approach partitions the data but does not address potential dependencies between adjacent folds."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines are used."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "BA and NWP rely on largely distinct subsets of input words, with IoU ≈0.1–0.2 at low attribution thresholds (t ≤ 10%).",
    375       "evidence": "Figure 2 shows IoU between BA and NWP top-attributed word sets across thresholds. At t=10%, IoU ≈0.16 while random baseline is essentially zero. Replicated on MRH dataset (Figure 10, Appendix D.1).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "BA has higher attribution spread than NWP at middle and late layers, while NWP has higher spread at early layers.",
    380       "evidence": "Figure 3 shows the number of unique words needed to reach attribution thresholds, with AUC quantifying spread. AUC for BA increases from early to late layers; NWP AUC decreases. Significance tested with paired t-tests (p < 0.001, BH-corrected). Replicated on MRH (Figure 11).",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "NWP relies heavily on syntactic features, while BA draws more heavily on semantic and discourse-level information.",
    385       "evidence": "Figure 4 shows distribution of top-attributed words across linguistic categories. At t=60%, NWP marks ≈7% of semantic features as uniquely important vs BA's ≈20%. NWP shows clear syntactic bias across thresholds. Confirmed with IG attribution method (Appendix E.1).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "NWP exhibits both recency and primacy biases across architectures, while BA shows a more pronounced but broader recency effect with weaker primacy.",
    390       "evidence": "Figure 5 and Appendix F show positional distributions of top-attributed words. NWP shows sharp bimodal peaks (recency + primacy) in all 5 models. BA shows a broader recency peak and weaker primacy. CoM analysis confirms BA's CoM is closer to the most recent word for 4/5 models.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Transformers, SSMs, and hybrid architectures behave largely similarly in terms of brain alignment attribution patterns.",
    395       "evidence": "Results across 5 models (3 transformers, 1 SSM, 1 hybrid) show consistent IoU, attribution spread, and feature-type patterns. Figures 2, 3, 4 show per-model consistency. Llama3.2-1B is an exception with oscillatory positional patterns.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "Llama3.2-1B's oscillatory attribution pattern is stimulus- and context-dependent, not an invariant architectural property.",
    400       "evidence": "The oscillatory pattern disappears with shorter contexts (80 words, Appendix I), on MRH dataset (Appendix D.3), and is absent in architecturally similar Qwen2-1.5B (Appendix H). IG produces the same pattern, ruling out attribution method artifacts (Appendix E.2).",
    401       "supported": "moderate"
    402     },
    403     {
    404       "claim": "Top-attributed words are functionally important: masking 1% of top-attributed words virtually abolishes predictive power for both BA and NWP.",
    405       "evidence": "Figure 8 shows CE loss more than doubles when masking top 1% words for NWP. Figure 9 shows Pearson r drops by nearly 100% across all brain ROIs when masking top 1% for BA. Consistent across all 5 models.",
    406       "supported": "strong"
    407     }
    408   ],
    409   "red_flags": [
    410     {
    411       "flag": "Small human subject sample size",
    412       "detail": "Only 8 subjects (HP) and 9 subjects (MRH) are used, with no power analysis or sample size justification. The small N limits the generalizability of findings about brain-LLM alignment to the broader population. Per-subject analyses (Appendix F) show consistency, but this cannot substitute for an adequate sample size."
    413     },
    414     {
    415       "flag": "Narrow model scale",
    416       "detail": "All five models are 1–2B parameters. The paper's title and abstract make general claims about 'LLMs' and 'Brain-LLM alignment,' but these findings may not hold for larger models (7B+, 70B+) that form the basis of most deployed systems and typically show qualitatively different behavior."
    417     },
    418     {
    419       "flag": "Potential text contamination confound",
    420       "detail": "All tested LLMs were almost certainly trained on Harry Potter text (published 1998, extremely widely available online). The paper does not discuss whether models' prior exposure to the stimulus text could inflate or distort brain alignment scores compared to novel text. The MRH replication mitigates this concern somewhat."
    421     },
    422     {
    423       "flag": "No competing interests statement",
    424       "detail": "One author is affiliated with Sony AI (corporate research lab), yet no funding disclosure or competing interests statement is provided."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "Lost in the middle: How language models use long contexts",
    430       "authors": ["Nelson F. Liu", "Kevin Lin", "John Hewitt", "Ashwin Paranjape", "Michele Bevilacqua", "Fabio Petroni", "Percy Liang"],
    431       "year": 2024,
    432       "doi": "10.1162/tacl_a_00638",
    433       "relevance": "Demonstrates positional biases (lost-in-the-middle effect) in LLM context processing, directly relevant to understanding LLM capability limitations."
    434     },
    435     {
    436       "title": "Mamba: Linear-time sequence modeling with selective state spaces",
    437       "authors": ["Albert Gu", "Tri Dao"],
    438       "year": 2023,
    439       "arxiv_id": "2312.00752",
    440       "relevance": "Introduces the Mamba SSM architecture as an alternative to transformers, relevant to understanding LLM architecture diversity."
    441     },
    442     {
    443       "title": "The Falcon 3 family of open models",
    444       "authors": ["Falcon-LLM Team"],
    445       "year": 2024,
    446       "relevance": "Describes the Falcon3 model family including distillation and pruning techniques for efficient open LLMs."
    447     },
    448     {
    449       "title": "Gemma: Open models based on gemini research and technology",
    450       "authors": ["Thomas Mesnard", "Cassidy Hardin", "Robert Dadashi"],
    451       "year": 2024,
    452       "arxiv_id": "2403.08295",
    453       "relevance": "Documents the Gemma open-weight LLM family and distillation methodology from Google."
    454     },
    455     {
    456       "title": "LoRA: Low-rank adaptation of large language models",
    457       "authors": ["Edward J. Hu", "Yelong Shen", "Phillip Wallis"],
    458       "year": 2022,
    459       "relevance": "Introduces LoRA parameter-efficient fine-tuning used in the Zamba2 hybrid model architecture."
    460     },
    461     {
    462       "title": "Captum: A unified and generic model interpretability library for pytorch",
    463       "authors": ["Narine Kokhlikyan", "Vivek Miglani", "Miguel Martin"],
    464       "year": 2020,
    465       "arxiv_id": "2009.07896",
    466       "relevance": "Provides the model interpretability toolkit used for gradient-based attribution methods, relevant to LLM explainability research."
    467     },
    468     {
    469       "title": "HuggingFace's Transformers: State-of-the-art natural language processing",
    470       "authors": ["Thomas Wolf", "Lysandre Debut", "Victor Sanh"],
    471       "year": 2019,
    472       "arxiv_id": "1910.03771",
    473       "relevance": "The foundational open-source library for LLM inference and experimentation used across AI research."
    474     },
    475     {
    476       "title": "From language to cognition: How LLMs outgrow the human language network",
    477       "authors": ["Badr AlKhamissi", "Greta Tuckute", "Yingtian Tang"],
    478       "year": 2025,
    479       "arxiv_id": "2503.01830",
    480       "relevance": "Shows that brain-LLM alignment and next-word prediction decouple during training, directly relevant to understanding what drives LLM capabilities beyond language modeling."
    481     },
    482     {
    483       "title": "Roformer: Enhanced transformer with rotary position embedding",
    484       "authors": ["Jianlin Su", "Murtadha Ahmed", "Yu Lu"],
    485       "year": 2024,
    486       "relevance": "Introduces RoPE positional encoding used in most modern LLMs, relevant to understanding positional biases in context processing."
    487     },
    488     {
    489       "title": "Axiomatic attribution for deep networks",
    490       "authors": ["Mukund Sundararajan", "Ankur Taly", "Qiqi Yan"],
    491       "year": 2017,
    492       "relevance": "Introduces Integrated Gradients method for model interpretability, widely used for explaining neural network predictions including LLMs."
    493     }
    494   ],
    495   "engagement_factors": {
    496     "practical_relevance": {
    497       "score": 1,
    498       "justification": "The attribution method could be useful for researchers studying brain-LLM alignment, but has no direct practitioner application for building or deploying AI systems."
    499     },
    500     "surprise_contrarian": {
    501       "score": 1,
    502       "justification": "Finding that BA and NWP rely on distinct features is somewhat surprising but broadly consistent with recent work (Merlin & Toneva, 2024; AlKhamissi et al., 2025)."
    503     },
    504     "fear_safety": {
    505       "score": 0,
    506       "justification": "No AI risk or security concerns are raised; the paper is about neuroscience/NLP alignment analysis."
    507     },
    508     "drama_conflict": {
    509       "score": 0,
    510       "justification": "No controversy or confrontational framing; the paper contributes to an ongoing scientific debate about brain-LLM alignment."
    511     },
    512     "demo_ability": {
    513       "score": 1,
    514       "justification": "Code is released on GitHub but requires fMRI data processing expertise and GPU compute; not easily demoed."
    515     },
    516     "brand_recognition": {
    517       "score": 0,
    518       "justification": "Authors from Sapienza University, Sony AI, and Max Planck Institute; not headline-grabbing AI labs or well-known products."
    519     }
    520   }
    521 }

Impressum · Datenschutz