ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (36751B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Fine-grained Analysis of Brain-LLM Alignment through Input Attribution",
      6     "authors": [
      7       "Michela Proietti",
      8       "Roberto Capobianco",
      9       "Mariya Toneva"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2510.12355",
     14     "doi": "10.48550/arXiv.2510.12355"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All abstract claims are supported: (1) the claim that BA and NWP rely on 'largely distinct word subsets' is supported by IoU analysis (Figure 2); (2) 'NWP exhibits recency and primacy biases with a focus on syntax' is supported by Figures 4–5; (3) 'BA prioritizes semantic and discourse-level information' is supported by Figure 4. The claim about generalizability is supported by the MRH replication (Appendix D).",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper's main causal claim—that top-attributed words are functionally important for each task—is supported by masking experiments (Appendix C) showing performance collapse when top-1% words are replaced. Other claims are framed as correlational/attributional (e.g., 'BA relies on,' 'NWP exhibits'), avoiding unjustified causal language.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The title 'Fine-grained Analysis of Brain-LLM Alignment through Input Attribution' and abstract claims about 'LLMs' are broader than the tested setting: only 1–2B parameter models were used, only two fMRI datasets (both English reading comprehension), and only frozen pretrained models. The paper acknowledges 'Future work should apply our framework to larger and more diverse models' but the title and abstract do not bound these limitations.",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Multiple alternative explanations are discussed: for positional biases, the paper considers both architectural factors (RoPE, attention) and training data biases (Section 4). For Llama3.2-1B's oscillatory pattern, they test architecture vs stimulus vs context-length explanations (Appendices H, I, D.3). The Limitations section discusses sensitivity of gradient-based methods.",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper clearly defines brain alignment as 'the performance of brain encoding models that predict brain activity given LLM representations, typically measured as the Pearson correlation between true and predicted activity' (Section 1). Claims are framed in terms of this specific measure rather than broader cognition claims, and the paper explicitly notes that 'our findings reflect the models' inductive biases, rather than an optimal solution.'",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "A dedicated 'Limitations' subsection appears in Section 5 (Discussion), discussing gradient-based method sensitivity, coarse discourse annotations, and the use of frozen models.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific threats are discussed: (1) 'gradient-based attribution methods may be sensitive to local nonlinearities and model smoothness,' mitigated by cross-architecture validation and multiple methods; (2) 'discourse feature annotations used in our analysis on the HP dataset are relatively coarse and limited to predefined categories'; (3) 'we perform attributions using frozen models, without optimizing them for BA.'",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The paper notes model scale and architecture limitations ('Future work should apply our framework to larger and more diverse models') and frozen-model constraints, but does not explicitly state what the results do NOT show. The scope boundaries are left implicit rather than explicitly enumerated.",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding acknowledgment or grant information is present in the paper. Authors are from Sapienza University, Sony AI, and Max Planck Institute.",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are clearly listed: Michela Proietti (Sapienza University of Rome), Roberto Capobianco (Sony AI, Zurich), Mariya Toneva (Max Planck Institute for Software Systems). The corporate affiliation (Sony AI) is disclosed.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "No funding source is disclosed, making it impossible to assess funder independence. One author is affiliated with Sony AI, a corporate research lab, but no explicit funding relationship is stated.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests statement is present in the paper.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Brain alignment (BA) is explicitly defined as 'the performance of brain encoding models that predict brain activity given LLM representations, typically measured as the Pearson correlation between true and predicted activity'; input attribution methods (GXI and IG) are formally defined with equations in Appendix A.3.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Contributions are enumerated as four explicit bullet points in the introduction: novel attribution framework, architectural comparison finding, case study of BA vs NWP, and specific findings about feature reliance and positional biases.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Related work substantively engages with prior attribution approaches (Russo et al., 2022; Rahimi et al., 2025), perturbation-based methods (Toneva et al., 2022; Merlin & Toneva, 2024), and directly addresses the prior debate about NWP as BA driver (Schrimpf et al., 2021 vs Merlin & Toneva, 2024).",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "The Reproducibility Statement provides a GitHub link: 'Open-source code, including data preprocessing scripts, attribution implementations, and evaluation procedures, available at https://github.com/michelaproietti/Brain-LLM-Alignment-Attribution.'",
    123           "source": "opus"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The paper uses publicly available fMRI datasets: the HP dataset (Wehbe et al., 2014a) and the MRH dataset (Deniz et al., 2019). All five LLMs are publicly available from HuggingFace.",
    129           "source": "opus"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "The paper reports hardware (NVIDIA H100 GPU) and mentions the Captum library and HuggingFace Transformers, but does not provide a requirements.txt, Dockerfile, or detailed dependency version list sufficient to recreate the environment.",
    135           "source": "opus"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "The Reproducibility Statement points to code and describes the methodology in detail, but the paper itself does not include step-by-step reproduction instructions (e.g., specific commands to run). The reader must infer the workflow from the code repository.",
    141           "source": "opus"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Error bars representing standard error are reported in multiple figures: 'Error bars represent the standard error across contexts' (Figure 3), 'standard error across subjects' (Figure 22a), and 'standard errors across models' (Figure 4).",
    149           "source": "opus"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Statistical significance is assessed: 'Asterisks denote significant differences (p < 0.001), assessed via a two-sided paired t-test, with Benjamini-Hochberg correction' (Figure 3). Paired t-tests are also used in Figure 22b for pairwise model comparisons.",
    155           "source": "opus"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Effect sizes are reported with context: IoU values (e.g., '≈0.16' vs random baseline 'essentially zero'), AUC values for attribution spread, percentage drops in masking experiments ('more than doubles the loss' at 1%), and CoM values. These provide magnitude context for comparisons.",
    161           "source": "opus"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The paper uses 8 subjects (HP) and 9 subjects (MRH) without a power analysis or any statistical justification for these sample sizes. The only rationale offered concerns dataset choice rather than sample size: these are widely-used public datasets, with HP described as 'one of the public fMRI dataset with the most data per participant.'",
    167           "source": "opus"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Standard errors are reported across subjects, contexts, and models throughout the figures (Figures 3, 4, 22). Per-subject results are shown in Appendix F (Figures 19, 20) demonstrating inter-subject consistency.",
    173           "source": "opus"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "A random baseline is included for IoU analysis: 'for each TR and threshold t, we drew 100 pairs of random word sets matching the sizes of the BA- and NWP-top-t% sets, and averaged their IoUs' (Section 4, Figure 2). The comparison between BA and NWP itself serves as a controlled comparison.",
    181           "source": "opus"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "The five LLMs tested span contemporary architectures from 2023–2024: Falcon3-1B, Gemma-2B, Llama3.2-1B, Mamba-1.4B, and Zamba2-1.2B, covering transformers, SSMs, and hybrid models.",
    187           "source": "opus"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "The masking experiments in Appendix C systematically remove top-attributed words to verify their functional importance, serving as an ablation of the attribution signal. The paper also tests shorter contexts (80 words vs 640, Appendix I) and alternative attribution methods (IG vs GXI, Appendix E).",
    193           "source": "opus"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Multiple evaluation metrics are used: Intersection over Union (IoU) for overlap, Center of Mass (CoM) for positional patterns, attribution spread (AUC), linguistic feature distribution analysis, masking-based performance drops (CE loss for NWP, Pearson r for BA), and pairwise model significance tests.",
    199           "source": "opus"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "No human evaluation of the attribution outputs is performed. All evaluation is automated using quantitative metrics (IoU, CoM, statistical tests). Human judgment of whether the identified important words are linguistically meaningful could have been informative.",
    205           "source": "opus"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "Cross-validation with held-out test folds is used: '4-fold cross-validation for HP, and 11-fold cross-validation (with each fold corresponding to one story) for MRH, and select the regularization strength via nested cross-validation' (Section 3.3.1).",
    211           "source": "opus"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Results are broken down by model (5 models), layer depth (early/middle/late), attribution threshold (1%–98%), linguistic feature type (semantic/syntactic/discourse), dataset (HP/MRH), and individual subjects (Appendix F).",
    217           "source": "opus"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "The anomalous oscillatory attribution pattern of Llama3.2-1B is investigated in depth (Section 4, Appendices H, I), including tests with Qwen2-1.5B (similar architecture, no oscillation), shorter contexts (oscillations disappear), and a second dataset (MRH, no oscillation). Limitations of gradient-based methods are also discussed.",
    223           "source": "opus"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "The paper reports that Llama3.2-1B's oscillatory pattern does not generalize: it disappears with shorter contexts (Appendix I), on the MRH dataset (Appendix D.3), and is absent in architecturally similar Qwen2-1.5B (Appendix H). The paper also reports that BA attribution overlap between long and short contexts is low (IoU < 0.1 for t < 80%), indicating fragility of the attribution signal.",
    229           "source": "opus"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "All five models are specified by name and size with citations to their specific releases: Falcon3-1B (Team, 2024), Gemma-2B (Mesnard et al., 2024), Llama3.2-1B (Meta AI, 2024), Mamba-1.4B (Gu & Dao, 2023), Zamba2-1.2B (Glorioso et al., 2024). These are open-source models with unique HuggingFace identifiers. Architectural details (hidden size, context length, layers) are in Table 1.",
    237           "source": "opus"
    238         },
    239         "prompts_provided": {
    240           "applies": false,
    241           "answer": false,
    242           "justification": "The paper does not use prompting. Models are used as frozen feature extractors: text is fed through and layer embeddings are extracted without any prompt engineering.",
    243           "source": "opus"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Key hyperparameters are reported: context length L=640 words, IG interpolation steps m=20, cross-validation folds (4 for HP, 11 for MRH), hemodynamic delay D=4 TRs, ridge regression with nested CV for regularization. Selected layers for attribution are listed in Table 2.",
    249           "source": "opus"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "No agentic scaffolding is used. The pipeline is a standard feature extraction → linear regression → gradient attribution framework.",
    255           "source": "opus"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Preprocessing is thoroughly documented: context construction (L=640, target word as final word), token-to-word embedding aggregation (averaging), word-to-TR downsampling (averaging per TR), hemodynamic delay concatenation (4 TRs), and the full pipeline is illustrated in Figure 6 and described in Section 3.3.1 and Appendix A.4.",
    261           "source": "opus"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "Both fMRI datasets are publicly available: HP (Wehbe et al., 2014a) and MRH (Deniz et al., 2019). All models are on HuggingFace. Code is on GitHub. This enables independent verification of the full pipeline.",
    269           "source": "opus"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "The HP dataset is described: '8 subjects reading chapter 9 of Harry Potter and the Sorcerer's Stone, N=5176 words, presented one-by-one for 0.5 seconds each, fMRI sampled at TR=2 seconds, divided into four runs.' The MRH dataset is described in Appendix A.1 with similar detail.",
    275           "source": "opus"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": true,
    279           "answer": false,
    280           "justification": "The paper uses existing public fMRI datasets but does not describe how the original participants were recruited. Only the data characteristics (number of subjects, task) are reported, with references to the original data papers.",
    281           "source": "opus"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The full pipeline is documented from raw data to final analysis: text → tokenization → LLM embeddings → word-level aggregation → TR-level downsampling → hemodynamic delay → ridge regression → attribution scores → aggregation to word-level. Figure 6 and Section 3.3 detail each step.",
    287           "source": "opus"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": false,
    293           "answer": false,
    294           "justification": "This paper uses frozen pre-trained LLMs as feature extractors to predict brain activity from fMRI data, not to evaluate model capability on a benchmark. Whether models saw the stimulus text (Harry Potter) during training does not constitute traditional benchmark contamination.",
    295           "source": "opus"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": false,
    299           "answer": false,
    300           "justification": "The paper does not evaluate pre-trained model capability on a benchmark. LLM representations are used as inputs to a separately trained brain encoding model, so train/test overlap in the traditional contamination sense is not applicable.",
    301           "source": "opus"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": false,
    305           "answer": false,
    306           "justification": "Not applicable. The paper evaluates brain alignment quality of LLM representations, not model performance on a knowledge or capability benchmark.",
    307           "source": "opus"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "This paper did not conduct a human subjects study. It reuses existing public fMRI datasets (Wehbe et al., 2014a; Deniz et al., 2019).",
    315           "source": "opus"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No new human data was collected. The paper uses existing public fMRI datasets.",
    321           "source": "opus"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No new human participants were recruited. The paper references existing datasets without reporting subject demographics beyond count (8 for HP, 9 for MRH).",
    327           "source": "opus"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No new participants were recruited. The paper uses existing public fMRI datasets.",
    333           "source": "opus"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human subjects study was conducted; this is a computational analysis using existing fMRI data.",
    339           "source": "opus"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human subjects study was conducted; no blinding is relevant.",
    345           "source": "opus"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No new human participants; the paper uses all subjects from the existing public datasets.",
    351           "source": "opus"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "Appendix J (Tables 3–5) reports detailed per-task compute times and peak GPU memory for each model on each dataset. For example, GXI attribution for Llama3.2-1B on HP took 1 day 14 hours with 2.53 GB peak memory.",
    359           "source": "opus"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": true,
    364           "justification": "Appendix J summarizes total compute: '≈2 hours for representation extraction, ≈219 hours for brain alignment training, ≈1501 hours for GXI attributions, ≈329 hours for IG attribution, ≈3.6 hours for NWP attribution.' All experiments on a single NVIDIA H100 GPU (80GB).",
    365           "source": "opus"
    366         }
    367       },
    368       "experimental_rigor": {
    369         "seed_sensitivity_reported": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "No multiple random seed analysis is reported. The ridge regression and gradient-based attribution procedures are largely deterministic given the data splits, but no seed sensitivity analysis is performed or discussed.",
    373           "source": "opus"
    374         },
    375         "number_of_runs_stated": {
    376           "applies": true,
    377           "answer": true,
    378           "justification": "The cross-validation procedure is clearly stated: 4-fold CV for HP, 11-fold CV for MRH, with nested CV for regularization selection. The number of subjects (8 HP, 9 MRH) and averaging procedures are specified.",
    379           "source": "opus"
    380         },
    381         "hyperparameter_search_budget": {
    382           "applies": true,
    383           "answer": false,
    384           "justification": "Ridge regularization is selected via nested cross-validation, but the paper does not report the range of regularization values searched, the number of candidates tried, or the search method.",
    385           "source": "opus"
    386         },
    387         "best_config_selection_justified": {
    388           "applies": true,
    389           "answer": true,
    390           "justification": "Layer selection is justified: 'we evaluate brain encoding performance at every layer... We divide each model into three equal-depth sections and select the layer within each section that achieves the highest average correlation across voxels and subjects' (Appendix B). Regularization is selected via nested CV.",
    391           "source": "opus"
    392         },
    393         "multiple_comparison_correction": {
    394           "applies": true,
    395           "answer": true,
    396           "justification": "Benjamini-Hochberg correction is applied: 'Asterisks denote significant differences (p < 0.001), assessed via a two-sided paired t-test, with Benjamini-Hochberg correction (Benjamini & Hochberg, 1995)' (Figure 3 caption).",
    397           "source": "opus"
    398         },
    399         "self_comparison_bias_addressed": {
    400           "applies": true,
    401           "answer": false,
    402           "justification": "The paper introduces a novel attribution framework and evaluates it without discussing the bias of evaluating one's own method. No independent evaluation or explicit acknowledgment of self-comparison bias is provided.",
    403           "source": "opus"
    404         },
    405         "compute_budget_vs_performance": {
    406           "applies": true,
    407           "answer": false,
    408           "justification": "While compute costs are reported in Appendix J, performance is not analyzed as a function of compute budget. The varying compute costs across models (e.g., Mamba-1.4B GXI takes 11.5 days vs Llama3.2-1B at 1.6 days) are not related to brain alignment performance.",
    409           "source": "opus"
    410         },
    411         "benchmark_construct_validity": {
    412           "applies": true,
    413           "answer": false,
    414           "justification": "The paper does not question whether Pearson correlation between predicted and actual voxel activity is a valid measure of 'brain-LLM alignment.' The masking experiments validate the attribution method, but the higher-level construct (whether encoding model performance reflects meaningful language processing alignment) is not critically examined.",
    415           "source": "opus"
    416         },
    417         "scaffold_confound_addressed": {
    418           "applies": false,
    419           "answer": false,
    420           "justification": "No scaffolding is involved. Models are used as frozen feature extractors with identical processing pipelines.",
    421           "source": "opus"
    422         }
    423       },
    424       "data_leakage": {
    425         "temporal_leakage_addressed": {
    426           "applies": true,
    427           "answer": false,
    428           "justification": "The Harry Potter text (published 1998) was almost certainly in the training data of all five LLMs (trained 2023–2024). The paper does not discuss whether models' familiarity with the text affects their representations and thus brain alignment scores.",
    429           "source": "opus"
    430         },
    431         "feature_leakage_addressed": {
    432           "applies": true,
    433           "answer": false,
    434           "justification": "The paper does not explicitly discuss whether the evaluation setup (e.g., 640-word context windows) introduces any information leakage. The experimental design appears clean (target word is always the final word), but this is not discussed.",
    435           "source": "opus"
    436         },
    437         "non_independence_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "All TRs and contexts come from the same continuous text (a single Harry Potter chapter), introducing temporal autocorrelation and non-independence between examples. The cross-validation approach partitions the data but does not address potential dependencies between adjacent folds.",
    441           "source": "opus"
    442         },
    443         "leakage_detection_method": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines are used.",
    447           "source": "opus"
    448         }
    449       }
    450     }
    451   },
    452   "claims": [
    453     {
    454       "claim": "Brain alignment and next-word prediction rely on largely distinct subsets of input words (IoU ≈0.1–0.2 at top-10% threshold across all models)",
    455       "evidence": "Figure 2 shows IoU between BA and NWP attribution sets; at t≤10% IoU is 0.1–0.2, consistently 1.5–2x above random baseline, replicated on MRH in Appendix D.1",
    456       "supported": "strong"
    457     },
    458     {
    459       "claim": "NWP exhibits strong recency and primacy biases across transformer and SSM architectures, while BA shows only a broader recency effect",
    460       "evidence": "Figures 5 and 18 show bimodal (recency + primacy) NWP distribution vs unimodal broader BA distribution across all 5 models; CoM values confirm the positional difference",
    461       "supported": "strong"
    462     },
    463     {
    464       "claim": "BA draws more heavily on semantic and discourse-level features than NWP, which prioritizes syntactic features",
    465       "evidence": "Figure 4 shows at all thresholds (10%, 60%, 80%), NWP-attributed words are disproportionately syntactic while BA-attributed words contain higher proportions of semantic and discourse features; replicated with IG in Appendix E.1",
    466       "supported": "strong"
    467     },
    468     {
    469       "claim": "Attribution spread is higher for NWP at early layers and reverses to favor BA at middle and late layers",
    470       "evidence": "Figure 3 AUC values show NWP>BA at early layers and BA>NWP at middle/late layers with p<0.001 by paired t-test with BH correction on HP dataset; replicated on MRH (Appendix D.2)",
    471       "supported": "strong"
    472     },
    473     {
    474       "claim": "Transformers, SSMs, and hybrid architectures behave largely similarly in terms of brain alignment attribution patterns",
    475       "evidence": "Figures 5 and 18 show consistent positional and spread patterns across Falcon, Gemma, Mamba, and Zamba; Figure 22b shows no significant BA score differences between most model pairs",
    476       "supported": "moderate"
    477     },
    478     {
    479       "claim": "Llama3.2-1B's oscillatory BA attribution pattern is stimulus- and context-dependent rather than an architectural invariant",
    480       "evidence": "Appendix H shows that Qwen2-1.5B, which shares key Llama features (RoPE, GQA, FlashAttention2), lacks oscillations; Appendix I shows oscillations disappear with 80-word contexts; Appendix D.3 shows no oscillations on the MRH dataset",
    481       "supported": "moderate"
    482     }
    483   ],
    484   "methodology_tags": [
    485     "observational",
    486     "case-study"
    487   ],
    488   "key_findings": "The paper introduces the first end-to-end gradient-based attribution framework for brain-LLM alignment and applies it to show that brain alignment (BA) and next-word prediction (NWP) rely on largely distinct subsets of input words: NWP concentrates on syntactic features with sharp recency and primacy biases, while BA distributes attribution more broadly with emphasis on semantic and discourse content. Attribution spread analysis reveals opposing layer-depth trends — NWP uses more distributed early-layer signals while BA requires increasingly distributed mid/late-layer signals — suggesting BA depends on higher-order linguistic representations that emerge at greater processing depth. These findings are consistent across 5 models spanning three architecture families (transformer, SSM, hybrid) and replicated on two independent fMRI datasets, strengthening the conclusion that NWP alone does not capture the full linguistic basis of brain-LLM alignment.",
    489   "red_flags": [
    490     {
    491       "flag": "Training data contamination unaddressed",
    492       "detail": "Harry Potter and the Sorcerer's Stone (the primary fMRI stimulus) was almost certainly in the training corpora of all five tested models; this contamination could systematically inflate NWP performance and skew attribution patterns in ways that are never acknowledged or discussed."
    493     },
    494     {
    495       "flag": "Small fMRI subject pool without power analysis",
    496       "detail": "Only 8 subjects for the HP dataset and 9 for MRH; while standard for fMRI studies, no power analysis is provided to justify that this N is sufficient for the cross-model and cross-layer comparisons reported."
    497     },
    498     {
    499       "flag": "Limited model scale generalizability",
    500       "detail": "All 5 models are in the 1–2B parameter range; the paper makes broad claims about transformer and SSM behavior without qualifying that findings may not hold for larger models (7B, 70B+) with qualitatively different representation structures."
    501     },
    502     {
    503       "flag": "Causal language for correlational method",
    504       "detail": "Gradient attribution methods measure output sensitivity to inputs, not causal influence; the paper repeatedly uses language like 'BA relies on', 'NWP driven by', 'depends on' which implies causality that gradient-based methods cannot establish."
    505     }
    506   ],
    507   "cited_papers": [
    508     {
    509       "title": "Language models and brains align due to more than next-word prediction and word-level information",
    510       "relevance": "Direct predecessor showing BA and NWP decouple; this paper extends that finding from corpus-level to fine-grained word-level attribution analysis"
    511     },
    512     {
    513       "title": "The neural architecture of language: Integrative modeling converges on predictive processing",
    514       "relevance": "Key prior work arguing NWP is the primary driver of brain-LLM alignment; this paper directly challenges that claim at the word-attribution level"
    515     },
    516     {
    517       "title": "From language to cognition: How LLMs outgrow the human language network",
    518       "relevance": "Contemporaneous work showing BA and NWP decouple during training; provides temporal complement to this paper's architectural comparison"
    519     },
    520     {
    521       "title": "Token-wise decomposition of autoregressive language model hidden states for analyzing model predictions",
    522       "relevance": "Prior attribution work reporting recency effects in NWP; this paper confirms and extends with direct BA comparison"
    523     },
    524     {
    525       "title": "Joint processing of linguistic properties in brains and language models",
    526       "relevance": "Empirical work showing syntactic contributions to brain alignment; results confirmed and extended by this paper's attribution framework"
    527     },
    528     {
    529       "title": "Simultaneously uncovering the patterns of brain regions involved in different story reading subprocesses",
    530       "relevance": "Source of the Harry Potter fMRI dataset used as the primary evaluation data throughout this paper"
    531     },
    532     {
    533       "title": "Shared computational principles for language processing in humans and deep language models",
    534       "relevance": "Goldstein et al. 2022 establishing that NWP performance correlates with BA; one of the key claims this paper interrogates"
    535     }
    536   ],
    537   "engagement_factors": {
    538     "practical_relevance": {
    539       "score": 1,
    540       "justification": "The attribution method could be useful for researchers studying brain-LLM alignment, but has no direct practitioner application for building or deploying AI systems."
    541     },
    542     "surprise_contrarian": {
    543       "score": 1,
    544       "justification": "Finding that BA and NWP rely on distinct features is somewhat surprising but broadly consistent with recent work (Merlin & Toneva, 2024; AlKhamissi et al., 2025)."
    545     },
    546     "fear_safety": {
    547       "score": 0,
    548       "justification": "No AI risk or security concerns are raised; the paper is about neuroscience/NLP alignment analysis."
    549     },
    550     "drama_conflict": {
    551       "score": 0,
    552       "justification": "No controversy or confrontational framing; the paper contributes to an ongoing scientific debate about brain-LLM alignment."
    553     },
    554     "demo_ability": {
    555       "score": 1,
    556       "justification": "Code is released on GitHub but requires fMRI data processing expertise and GPU compute; not easily demoed."
    557     },
    558     "brand_recognition": {
    559       "score": 0,
    560       "justification": "Authors from Sapienza University, Sony AI, and Max Planck Institute; not headline-grabbing AI labs or well-known products."
    561     }
    562   },
    563   "hn_data": {
    564     "threads": [
    565       {
    566         "hn_id": "41929456",
    567         "title": "Quantum inspired factorization up to 100-bit RSA number in polynomial time [pdf]",
    568         "points": 4,
    569         "comments": 0,
    570         "url": "https://news.ycombinator.com/item?id=41929456"
    571       },
    572       {
    573         "hn_id": "38038429",
    574         "title": "GMEM: Generalized Memory Management for Peripheral Devices",
    575         "points": 3,
    576         "comments": 0,
    577         "url": "https://news.ycombinator.com/item?id=38038429"
    578       },
    579       {
    580         "hn_id": "42794658",
    581         "title": "Test-time regression: a unifying framework for designing sequence models",
    582         "points": 1,
    583         "comments": 0,
    584         "url": "https://news.ycombinator.com/item?id=42794658"
    585       },
    586       {
    587         "hn_id": "41933882",
    588         "title": "Quantum inspired factorization up to 100-bit RSA number in polynomial time",
    589         "points": 1,
    590         "comments": 0,
    591         "url": "https://news.ycombinator.com/item?id=41933882"
    592       }
    593     ],
    594     "top_points": 4,
    595     "total_points": 9,
    596     "total_comments": 0
    597   }
    598 }

Impressum · Datenschutz