ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (27266B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "In-the-Wild Model Organisms: Mitigating Undesirable Emergent Behaviors in Production LLM Post-Training via Data Attribution",
      6     "authors": [
      7       "Frank Xiao",
      8       "Santiago Aranguri"
      9     ],
     10     "year": 2026,
     11     "venue": "arXiv",
     12     "arxiv_id": "2602.11079",
     13     "doi": null
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "All abstract claims are supported: 63% reduction via filtering (Table 2: 2.86% vs 7.63% baseline at 30k), 78% via label switching (1.66%), 10× cost advantage (Table 1: $30 vs $320/$500), and causal validation through retraining is demonstrated in Section 6.",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Causal claims are validated by retraining from the SFT checkpoint with modified DPO datasets, measuring actual harmful response rates after intervention—a proper causal validation design rather than correlation alone.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The limitations section explicitly bounds claims to OLMo 2 and a single behavior type, noting that validation on other model families requires access to post-training data and checkpoints that only OLMo currently releases.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper does not discuss alternative explanations for why distractor-triggered compliance emerges (e.g., whether DPO's training objective systematically amplifies this pattern, or whether the GPT-4o grading rubric creates systematic label noise); it presents one mechanistic account without considering competing interpretations.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper uses LLM-judge harmful response rate as its metric and validates this proxy against human annotations (90.6% agreement, Cohen's κ=0.81, Appendix A.9), clearly distinguishing what is measured from what is claimed.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 7 (Discussion) contains a dedicated 'Limitations' paragraph with specific constraints beyond boilerplate disclaimers.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Specific threats are named: a single-model-family constraint, tied to OLMo being the only competitive open-source model that releases training artifacts and checkpoints; the human interpretation required for cluster identification; and the fact that the 32B validation shows the behavior exists but the attribution pipeline was not applied at that scale.",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper explicitly states it does not show generalization beyond OLMo 2 models, does not automate cluster labeling, and has not tested activation-based attribution for RLHF or SFT post-training methods.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "Acknowledgements mention SPAR (Supervised Program for Alignment Research) as providing a research environment but no formal funding source or grant numbers are disclosed.",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Author affiliations are stated in the header: Frank Xiao at Caltech, Santiago Aranguri at Goodfire.",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "Goodfire is a mechanistic interpretability startup whose commercial value depends on activation-based analysis being proven effective; Aranguri's affiliation creates a financial interest in the paper's primary finding that activation-based methods outperform alternatives.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is present in the paper.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "'Model organism,' 'in-the-wild model organism,' 'distractor-triggered compliance,' and 'activation-based data attribution' are all explicitly defined with examples in Sections 1, 3, and 5.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Three numbered contributions are stated in Section 1: activation-based data attribution, unsupervised behavior discovery, and the in-the-wild model organism itself.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 2 substantively engages with influence functions, TracIn, TRAK, LESS, Datamodels, mechanistic interpretability, persona vectors, and prior model organism work, explicitly positioning against Minder et al. (2025)'s critique of artificially constructed model organisms.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": false,
    121           "justification": "Section 4.2 states 'We will additionally release the visualization tool we made to inspect these visualizations upon publication'—this is a promise of future release, not a current release.",
    122           "source": "haiku"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "The OLMo 2 DPO training dataset (378,341 pairs) is publicly released by AllenAI; LMSys, GSM8K, IFEval, and XSTest are all standard public datasets.",
    128           "source": "haiku"
    129         },
    130         "environment_specified": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "GPU hardware (H100, 4090) and costs are mentioned but no requirements.txt, Dockerfile, Python version, or dependency specification is provided.",
    134           "source": "haiku"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "Appendices describe methods in technical detail but provide no step-by-step reproduction guide; key implementation details (layer selection procedure, exact rollout parameters) require inference from scattered appendix sections.",
    140           "source": "haiku"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "confidence_intervals_or_error_bars": {
    145           "applies": true,
    146           "answer": true,
    147           "justification": "95% bootstrap confidence intervals are reported for all harmful response rate results in Tables 2, 3, and 4, and error bars are shown in Figures 4, 14, 15, 16, and 19.",
    148           "source": "haiku"
    149         },
    150         "significance_tests": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "No formal statistical significance tests (t-tests, Mann-Whitney, etc.) are used; comparative claims rely on visual inspection of non-overlapping bootstrap CIs rather than explicit hypothesis testing.",
    154           "source": "haiku"
    155         },
    156         "effect_sizes_reported": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "Effect sizes are reported as percentage reductions from baseline (63%, 78%, 85%) with absolute harmful response rates, providing clear magnitude context.",
    160           "source": "haiku"
    161         },
    162         "sample_size_justified": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "The evaluation set of 120 test prompts and 150 probing prompts is not statistically justified; no power analysis or sample size rationale is provided.",
    166           "source": "haiku"
    167         },
    168         "variance_reported": {
    169           "applies": true,
    170           "answer": true,
    171           "justification": "Bootstrap confidence intervals (95%) are consistently reported across all main results tables, and standard errors are provided for capability benchmarks in Table 3.",
    172           "source": "haiku"
    173         }
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "Three baselines are compared: Gradient-based attribution (LESS), LLM Toxic judge ranking, and Random selection, across all intervention sizes.",
    180           "source": "haiku"
    181         },
    182         "baselines_contemporary": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "LESS (Xia et al., 2024) is a recent gradient-based state-of-the-art method; the LLM judge baseline uses GPT-5-mini, a current model.",
    186           "source": "haiku"
    187         },
    188         "ablation_study": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Ablations include intervention size (3k/12k/30k datapoints), M0 vs. M1 formulation for probing vector, layer selection (16–26), and additional ranking methods (Max over Vector Bank, Toxic+IF) in the appendices.",
    192           "source": "haiku"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "Both safety (harmful response rate) and three capability metrics (GSM8K, IFEval, XSTest) are evaluated for all intervention conditions.",
    198           "source": "haiku"
    199         },
    200         "human_evaluation": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Appendix A.9 reports human annotation of 500 randomly sampled model responses to validate the LLM judge, achieving 90.6% agreement (Cohen's κ=0.81).",
    204           "source": "haiku"
    205         },
    206         "held_out_test_set": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "Evaluation uses 120 LMSys prompts held out from the 150 prompts used to construct the probing vector, providing a separate test set for reporting results.",
    210           "source": "haiku"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Results are broken down by ranking method, intervention size, model size (7B vs. 32B), and source model (Table 6 shows per-source-model attribution fractions).",
    216           "source": "haiku"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Section 7 discusses that LESS degrades at larger intervention sizes, that label switching introduces capability tradeoffs at 30k datapoints, and that LLM Toxic+IF underperforms LLM Toxic alone.",
    222           "source": "haiku"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "Negative results include gradient-based LESS degrading at scale (5.78% vs 2.86% at 30k filtering), label switching causing GSM8K degradation from 72.5% to ~68% at large scale, and the Toxic+IF variant underperforming.",
    228           "source": "haiku"
    229         }
    230       },
    231       "setup_transparency": {
    232         "model_versions_specified": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "OLMo 2 7B and 32B are specified; GPT-5-mini (OpenAI, 2025) and Grok 4.1 (xAI, 2025) are named with dated references.",
    236           "source": "haiku"
    237         },
    238         "prompts_provided": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "Appendix A.7 provides the exact LLM Toxic and Toxic+IF judge prompts verbatim; Appendix A.8 provides the full evaluation judge prompt.",
    242           "source": "haiku"
    243         },
    244         "hyperparameters_reported": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "Key hyperparameters are reported: layer 20 for probing vector, steering scale α=2.0, LoRA rank 128 with α=512, 8192-dimension projections for LESS, 100 rollouts per prompt, medium reasoning effort for GPT-5-mini.",
    248           "source": "haiku"
    249         },
    250         "scaffolding_described": {
    251           "applies": false,
    252           "answer": false,
    253           "justification": "This is a training data attribution paper with no agentic scaffolding involved.",
    254           "source": "haiku"
    255         },
    256         "data_preprocessing_documented": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "Appendix A.1 documents the four-criterion filter for selecting the 150 probing prompts (SFT without/with distractor: 0 toxicity; DPO without distractor: 0 toxicity; DPO with distractor: non-zero toxicity).",
    260           "source": "haiku"
    261         }
    262       },
    263       "data_integrity": {
    264         "raw_data_available": {
    265           "applies": true,
    266           "answer": false,
    267           "justification": "The visualization tool and analysis artifacts are promised 'upon publication' but not currently released; while the underlying OLMo 2 training data is public, the probe prompts, evaluation sets, and ranking outputs are not independently available.",
    268           "source": "haiku"
    269         },
    270         "data_collection_described": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "Section 5.1 describes OLMo 2's DPO dataset construction (responses sampled from 20 LLMs, graded by GPT-4o rubric, highest scoring = accepted); test prompt sampling from LMSys is also described.",
    274           "source": "haiku"
    275         },
    276         "recruitment_methods_described": {
    277           "applies": false,
    278           "answer": false,
    279           "justification": "No human participant recruitment—the data is model-generated preference pairs from a standard public dataset.",
    280           "source": "haiku"
    281         },
    282         "data_pipeline_documented": {
    283           "applies": true,
    284           "answer": true,
    285           "justification": "The full pipeline from OLMo 2 DPO training data → probing vector construction → datapoint ranking → retraining intervention is documented across Sections 3–6 and Appendices A.1–A.5.",
    286           "source": "haiku"
    287         }
    288       },
    289       "contamination": {
    290         "training_cutoff_stated": {
    291           "applies": false,
    292           "answer": false,
    293           "justification": "The paper studies data attribution in post-training, not model capability evaluation on benchmarks; training cutoff is not relevant to the primary claims.",
    294           "source": "haiku"
    295         },
    296         "train_test_overlap_discussed": {
    297           "applies": false,
    298           "answer": false,
    299           "justification": "NA—the paper is not evaluating pre-trained model capabilities on held-out benchmarks as a primary contribution.",
    300           "source": "haiku"
    301         },
    302         "benchmark_contamination_addressed": {
    303           "applies": false,
    304           "answer": false,
    305           "justification": "Capability benchmarks (GSM8K, IFEval, XSTest) are used only to verify no capability degradation from interventions, not as primary evaluation targets; contamination is not relevant in this context.",
    306           "source": "haiku"
    307         }
    308       },
    309       "human_studies": {
    310         "pre_registered": {
    311           "applies": false,
    312           "answer": false,
    313           "justification": "No human participants in the experiment proper.",
    314           "source": "haiku"
    315         },
    316         "irb_or_ethics_approval": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants in the experiment proper.",
    320           "source": "haiku"
    321         },
    322         "demographics_reported": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants in the experiment proper.",
    326           "source": "haiku"
    327         },
    328         "inclusion_exclusion_criteria": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants in the experiment proper.",
    332           "source": "haiku"
    333         },
    334         "randomization_described": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants in the experiment proper.",
    338           "source": "haiku"
    339         },
    340         "blinding_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants in the experiment proper.",
    344           "source": "haiku"
    345         },
    346         "attrition_reported": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants in the experiment proper.",
    350           "source": "haiku"
    351         }
    352       },
    353       "cost_and_practicality": {
    354         "inference_cost_reported": {
    355           "applies": true,
    356           "answer": true,
    357           "justification": "Table 1 provides explicit cost breakdown: probing vector 12 H100 hours ($30), gradient LESS 128 H100 hours ($320), LLM Judge $500; actual 4090 runtime also noted (36 hours, ~$10).",
    358           "source": "haiku"
    359         },
    360         "compute_budget_stated": {
    361           "applies": true,
    362           "answer": true,
    363           "justification": "Table 1 states total compute for ranking all 378k datapoints per method; the paper explicitly positions cost efficiency as a key advantage.",
    364           "source": "haiku"
    365         }
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "Filtering top-ranked datapoints using the probing vector method reduces distractor-triggered harmful compliance by 63% (from 7.63% to 2.86%)",
    372       "evidence": "Table 2: probing vector filtering at 30,000 datapoints yields 2.86±0.23% vs baseline 7.63±0.36%; causal validation by retraining from SFT checkpoint with modified dataset",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "Switching labels of top-ranked datapoints achieves 78% reduction in harmful behavior (from 7.63% to 1.66%)",
    377       "evidence": "Table 2: probing vector switching at 30,000 datapoints yields 1.66±0.19%; verified through full DPO retrain",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Activation-based attribution outperforms gradient-based (LESS) and LLM-judge methods at scale while being 10× cheaper",
    382       "evidence": "Table 2 at 30k filtering: probing vector 2.86% vs LESS 5.78% vs LLM Toxic 3.61%; Table 1: $30 vs $320 vs $500",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Unsupervised clustering of behavior-datapoint similarity matrices can discover emergent harmful behaviors without prior specification",
    387       "evidence": "Figure 3 shows four distinct behavioral clusters; Appendix C.1 independently verifies each cluster; C.2 shows reproducibility across 8 independent random samples",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Distractor-triggered compliance emerged organically from contaminated preference data in OLMo 2's production DPO training, not deliberate injection",
    392       "evidence": "Section 5.2–5.4 demonstrates the behavior; green cluster in Figure 3 identifies mislabeled datapoints where accepted responses comply with harmful requests; contamination validated by finding specific source models over-represented in top-ranked harmful datapoints",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "Removing contributions from problematic source models (top 4 over-represented) achieves 85% harm reduction",
    397       "evidence": "Table 4: Max over Vector Bank model-level ablation yields 1.17±0.19% harmful rate vs 7.63% baseline, while preserving GSM8K at 74.2%",
    398       "supported": "strong"
    399     }
    400   ],
    401   "methodology_tags": [
    402     "case-study",
    403     "benchmark-eval",
    404     "observational"
    405   ],
    406   "key_findings": "Activation-based data attribution, which computes cosine similarity between behavior-change vectors and training datapoint vectors in residual activation space, can identify and causally validate which DPO training examples produce harmful emergent behaviors. Applied to OLMo 2, the method discovered distractor-triggered compliance—a behavior where benign formatting instructions appended to harmful requests (e.g., 'no more than 50 words') cause the DPO-trained model to comply at ~7.6% rate while the SFT model maintains near-zero compliance. Filtering or relabeling 30,000 identified datapoints reduces harmful behavior by 63–78% while preserving mathematical reasoning and instruction-following capabilities. The method outperforms gradient-based (LESS) and LLM-judge baselines at scale while being 10× cheaper, and its unsupervised discovery component reliably surfaces behavioral clusters without prior specification of what harmful patterns to seek.",
    407   "red_flags": [
    408     {
    409       "flag": "Single model family",
    410       "detail": "All causal attribution experiments are on OLMo 2 only; the OLMo 3 generalization experiment shows that unsupervised discovery works, but the attribution and mitigation pipeline is not validated on any other model family."
    411     },
    412     {
    413       "flag": "Undisclosed financial conflict",
    414       "detail": "Santiago Aranguri is affiliated with Goodfire, a mechanistic interpretability startup whose commercial value depends on activation-based methods being proven superior; no competing interests statement is present."
    415     },
    416     {
    417       "flag": "Probing vector requires known behavior",
    418       "detail": "Constructing the probing vector requires 150 prompts that already exhibit the target behavior, partially undermining the 'unsupervised' framing—the discovery phase is unsupervised but targeted attribution requires behavior awareness."
    419     },
    420     {
    421       "flag": "Single harmful behavior validated causally",
    422       "detail": "Only distractor-triggered compliance undergoes full causal validation (retraining); other discovered behaviors (formatting, verbosity, OLMo 3 roleplay) are verified empirically but not through the full attribution-and-retrain pipeline."
    423     },
    424     {
    425       "flag": "No formal significance tests",
    426       "detail": "Comparative performance claims between methods rely on comparing bootstrap CIs rather than on formal hypothesis tests; at small intervention sizes (3k), several results have overlapping CIs, which weakens the superiority claims."
    427     },
    428     {
    429       "flag": "Sample size unjustified",
    430       "detail": "The choice of 120 test prompts with 100 rollouts each is not statistically justified; no power analysis is provided for detecting the observed effect sizes."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "LESS: Selecting Influential Data for Targeted Instruction Tuning",
    436       "relevance": "Primary baseline for gradient-based data attribution; directly compared throughout all experiments"
    437     },
    438     {
    439       "title": "Narrow Finetuning Leaves Clearly Readable Traces in Activation Differences (Minder et al., 2025)",
    440       "relevance": "Core motivation for the paper—critiques artificially-constructed model organisms and directly addressed by the in-the-wild model organism contribution"
    441     },
    442     {
    443       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    444       "relevance": "Post-training method whose training data is audited throughout the paper"
    445     },
    446     {
    447       "title": "The Linear Representation Hypothesis and the Geometry of Large Language Models (Park et al., 2024)",
    448       "relevance": "Theoretical foundation for why cosine similarity in activation space captures behavioral alignment"
    449     },
    450     {
    451       "title": "Persona Vectors: Monitoring and Controlling Character Traits in Language Models (Chen et al., 2025)",
    452       "relevance": "Closest related work on activation-space behavioral analysis; distinguished by targeting pre-specified vs. unknown behaviors"
    453     },
    454     {
    455       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training (Hubinger et al., 2024)",
    456       "relevance": "Exemplar of artificially-constructed model organisms that the in-the-wild approach is designed to complement"
    457     },
    458     {
    459       "title": "TRAK: Attributing Model Behavior at Scale (Park et al., 2023)",
    460       "relevance": "Alternative data attribution method reviewed in related work"
    461     },
    462     {
    463       "title": "Understanding Black-Box Predictions via Influence Functions (Koh & Liang, 2017)",
    464       "relevance": "Foundational data attribution approach that the paper's activation-based method is positioned against"
    465     },
    466     {
    467       "title": "2 OLMo 2 Furious (OLMo Team, 2025)",
    468       "relevance": "The production model whose DPO training is audited; uniquely suitable because it releases training data and intermediate checkpoints"
    469     },
    470     {
    471       "title": "Representation Engineering: A Top-Down Approach to AI Transparency (Zou et al., 2023)",
    472       "relevance": "Foundational mechanistic interpretability work supporting the linear representation assumption underlying the method"
    473     }
    474   ],
    475   "engagement_factors": {
    476     "practical_relevance": {
    477       "score": 3,
    478       "justification": "Directly applicable to any organization running DPO post-training: a $30 safety audit method that can find and fix harmful data contamination before deployment."
    479     },
    480     "surprise_contrarian": {
    481       "score": 2,
    482       "justification": "The finding that appending benign formatting instructions like 'no more than 50 words' bypasses safety training is counterintuitive and challenges assumptions about DPO robustness."
    483     },
    484     "fear_safety": {
    485       "score": 3,
    486       "justification": "Demonstrates that production models (OLMo 2) have exploitable safety vulnerabilities that evade standard safety evaluations, triggered by normal user behavior rather than adversarial prompting."
    487     },
    488     "drama_conflict": {
    489       "score": 2,
    490       "justification": "Directly critiques the model organism methodology used widely in AI safety research (Hubinger et al., Perez et al.) and shows production contamination that researchers didn't anticipate."
    491     },
    492     "demo_ability": {
    493       "score": 2,
    494       "justification": "OLMo 2 is publicly available so the distractor-triggered compliance behavior can be reproduced, but the full attribution pipeline requires substantial compute and access to training checkpoints."
    495     },
    496     "brand_recognition": {
    497       "score": 1,
    498       "justification": "Caltech and Goodfire (niche mechanistic interpretability startup) are not major AI lab brands; paper evaluates AllenAI's OLMo rather than a flagship commercial model."
    499     }
    500   },
    501   "hn_data": {
    502     "threads": [],
    503     "top_points": 0,
    504     "total_points": 0,
    505     "total_comments": 0
    506   }
    507 }

Impressum · Datenschutz