ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (27347B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Layer of Truth: Probing Belief Shifts under Continual Pre-Training Poisoning",
      6     "authors": [
      7       "S. Churina",
      8       "Niranjan Chebrolu",
      9       "Kokil Jaidka"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2510.26829",
     14     "doi": "10.48550/arXiv.2510.26829"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All five major abstract claims (≥55% flip at moderate poisoning, late-layer concentration, 56.8% patching reversibility, selective commonsense degradation, alignment stability) are directly supported by experimental results in Section 6 and Appendix 9.5.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The controlled CPT setup (varying poison ratios ρ, non-poisoned ablation in Appendix 9.6, activation patching) provides adequate support for causal claims about poisoning inducing belief flips; the non-poisoned CPT control confirms effects are from poison rather than the CPT regime itself.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The limitations section (Section 8) explicitly bounds generalization to 52 factual entities, Qwen2.5 models up to 7B, and English/Russian only; the paper does not claim findings extend to frontier-scale models.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The paper considers whether degradation stems from CPT itself vs. poisoned data (Appendix 9.6), whether corruption is diffuse uncertainty vs. targeted replacement (Section 6.2), and whether backward reasoning coherence reflects internal representation change vs. surface association (Section 6.4).",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper explicitly distinguishes log-likelihood preference (internal 'belief') from external output accuracy and acknowledges the operational definition of belief as a learned preference between competing alternatives, not cognition in the human sense.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 8 contains a dedicated 'Limitations' paragraph covering dataset scope, model scale ceiling, and cross-lingual evaluation bounds.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific threats are named: only 52 entities due to compute constraints (not statistical power), experiments limited to ≤7B parameters, and cross-lingual evaluation only for English→Russian with acknowledged mechanism uncertainty.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper explicitly states it does not cover 'the full diversity of factual knowledge,' does not evaluate larger frontier models, and cross-lingual conclusions are limited to the English-poisoned/Russian-evaluated setting.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding acknowledgment or funding source appears anywhere in the paper.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors are identified as affiliated with the Centre for Trusted Internet & Community, National University of Singapore.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funder is disclosed, so independence cannot be assessed.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement, patent disclosures, or financial interest declarations appear in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "'Belief' is formally defined as a learned preference between competing factual alternatives (Section 3.1); 'poison ratio,' 'continual pre-training,' and 'belief shift' are all operationally defined before use.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 1 explicitly enumerates four contributions: (1) a controlled CPT poisoning setting, (2) a representation-level measurement stack, (3) evidence of discrete belief replacement, and (4) evidence of structured generalization.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 positions the work relative to factual hallucination research, data poisoning/backdoor attacks, continual pre-training literature, and mechanistic interpretability, explicitly noting the gap (longitudinal representation analysis during CPT) this paper fills.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "No code repository or release is mentioned anywhere in the paper.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "The custom dataset D (212 unique entities, 147,884 instances) was constructed for this paper and is not publicly released; no download link or repository is provided.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Appendix 9.2 states 'single GPU using bfloat16' but does not specify GPU model, CUDA version, Python version, or provide any requirements file.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "Appendix 9.2 lists hyperparameters but provides no runnable code, no step-by-step commands, and no data download path; reproduction would require significant guesswork.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Table 8 reports Mean ± Std (Min–Max) across checkpoints for all OOD conditions, and Figure 9 shows forest plots with 95% confidence intervals.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Appendix 9.5 reports one-sample t-tests with Bonferroni correction (α = 0.05/32 = 0.00156) across all 32 OOD comparisons, with explicit t-statistics and p-values in Table 10.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Cohen's d is reported for all conditions in Table 9, with explicit interpretation thresholds (negligible < 0.2, small 0.2–0.5, medium 0.5–0.8, large ≥0.8).",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The 52-entity CPT subset is justified only by 'computational constraints,' not by statistical power analysis or effect size estimation.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Table 8 reports standard deviation across checkpoints for all OOD conditions; logit lens analyses report σ ≈ 3–6 across questions.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Non-poisoned baseline models (ρ=0) are used throughout, and Appendix 9.6 provides an explicit non-poisoned CPT control trained for the same duration.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "The baseline is the same Qwen2.5 model without poisoning, which is the correct comparison for measuring poisoning effects; OOD benchmarks (HellaSwag, TruthfulQA) are standard contemporary evaluations.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "The paper ablates poison ratios (0.1, 0.5, 0.9, 1.0), model scales (0.5B, 1.5B, 3B, 7B), mechanistic probes (single-layer vs. window patching, attention head ablation), and prompt formats.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Evaluation uses log-likelihood differences, external output accuracy (Correct/Poisoned/Ambiguous), four OOD benchmarks, Garak robustness probes, CKA similarity, and patching rescue rates.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": false,
    203           "answer": false,
    204           "justification": "The paper evaluates internal model representations and automatic benchmark scores; human evaluation of outputs is not applicable to this mechanistic study.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": false,
    210           "justification": "The primary belief evaluation (belief flip rates, logit lens) is performed on the same 52 entities used for CPT poisoning—there is no held-out test set for the core factual belief claims; OOD benchmarks are held-out only for the secondary generalization analysis.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Table 5 reports per-question susceptibility across model scales and poison ratios; results are broken down by domain (general knowledge, mathematics, chemistry, translation) and model size.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Tables 6 and 7 provide detailed qualitative failure cases from backward reasoning (Stage 1: coherent false justifications; Stage 2: repetitive degeneration), and Table 11 shows per-format failure modes.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "At ρ=0.1 only 8% of answers were poisoned; alignment benchmarks (TruthfulQA, HH-RLHF) mostly showed non-significant effects; window patching did not consistently outperform single-layer patching.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "Models are described as 'Qwen2.5 models (0.5B, 1.5B, 3B, and 7B parameters) available on Hugging Face' without specifying exact checkpoint identifiers, commit hashes, or snapshot dates.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Table 2 provides all 10 prompt format templates with full example text; Table 11 shows specific prompt–response pairs; the backward reasoning prompt template is given verbatim in Section 9.4.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Appendix 9.2 reports: batch size 4, max sequence length 256, learning rate 1e-4, cosine LR scheduler, 200 warmup steps, AdamW optimizer, bfloat16 precision, and 12,000 training steps.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "No agentic scaffolding is used; this is a standard CPT and probing study.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Section 5 describes the full pipeline: source (General Knowledge Norms), filtering criteria, GPT-5-based counterfactual generation with manual validation, surface form expansion across 5 styles and 10 prompt formats, and stratified subsetting strategy.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "The custom dataset D (212 entities, 147,884 instances) is not released publicly; no repository, dataset card, or download link is provided.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section 5 describes the source (General Knowledge Norms), filtering of vague items, domain extension to mathematics/chemistry/translation, GPT-5 counterfactual generation, and manual validation of all ground-truth answers (Table 4).",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants were recruited; the dataset is constructed from existing norms and generated text.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The pipeline from General Knowledge Norms → filtering → GPT-5 counterfactual generation → manual validation → stylistic expansion → stratified subsetting for CPT is described end-to-end in Section 5.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "The Qwen2.5 training data cutoff is never stated; the paper does not specify whether the General Knowledge Norms facts were in Qwen2.5's pretraining corpus.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "The paper notes that chemistry facts may be 'unevenly represented in LLM training data' as a motivation for inclusion but does not systematically address overlap between the fact set and Qwen2.5's training data.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "HellaSwag, TruthfulQA, HH-RLHF, and BBEH Logic are evaluated without discussion of whether these benchmarks appeared in Qwen2.5's training data; the comparative design (poisoned vs. baseline) partially mitigates this but it is not acknowledged.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "No inference cost, latency, or GPU hours are reported; only 'single GPU using bfloat16' is mentioned for training.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "The paper states training was on 'a single GPU' but provides no GPU model, total compute hours, or FLOPs estimate.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Continual pre-training on plausible misinformation can overwrite specific factual knowledge in LLMs without degrading overall performance on alignment benchmarks.",
    373       "evidence": "At ρ≥0.5, poisoned preference rates exceed 55% across all model scales, while TruthfulQA and HH-RLHF remain largely stable (TruthfulQA is non-significant in 6 of 8 conditions in Table 10).",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Belief corruption emerges abruptly across training checkpoints rather than accumulating smoothly.",
    378       "evidence": "Figures 1b and 4a show extended plateaus of stable preference followed by rapid transitions; the step-like pattern is consistent across model scales and poison ratios.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Belief corruption concentrates in late transformer layers (Layers 25–36 in 3B models) rather than being uniformly distributed.",
    383       "evidence": "Logit lens analysis over 52 questions shows systematic late-layer collapse for ρ≥0.5, peaking at mean ≈−7.3 at Layer 36 for ρ=1.0; supported by converging evidence from head ablation and activation patching.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Corrupted beliefs are partially reversible via activation patching, with effects concentrated in late layers (up to 56.8% rescue rate).",
    388       "evidence": "Activation patching (Section 6.3) restores correct belief in 33.3% of cases when a single layer is patched; window patching achieves up to 56.8%; interventions at earlier layers produce minimal rescue.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Poisoning selectively degrades commonsense reasoning (HellaSwag) while leaving alignment metrics largely intact.",
    393       "evidence": "Cohen's d < −3.0 for all 8 HellaSwag conditions (p<0.001), while TruthfulQA is non-significant in 6 of 8 conditions and HH-RLHF is non-significant at the highest poison ratios (Tables 9–10).",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Poisoning paradoxically improves formal logic performance in 7B but not 3B models.",
    398       "evidence": "BBEH Logic Cohen's d ranges from +2.19 to +14.28 for 7B models (all significant, p<0.001), while 3B models show decline (d=−0.98 to −2.23); no mechanistic explanation is provided.",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "Poisoned beliefs generalize beyond training realizations to new prompt formats and across languages.",
    403       "evidence": "Belief flips are consistent across 10 prompt formats (Table 11); Russian cross-lingual evaluation shows decreased correct responses, though with higher ambiguity than English rather than clean substitution of the poisoned alternative.",
    404       "supported": "moderate"
    405     }
    406   ],
    407   "methodology_tags": [
    408     "benchmark-eval",
    409     "observational"
    410   ],
    411   "key_findings": "Continual pre-training on paired fact–counterfact corpora systematically replaces factual beliefs in Qwen2.5 LLMs rather than increasing uncertainty: at ≥50% poison ratios, over 55% of responses shift to the counterfactual alternative while ambiguity stays stable. Belief corruption localizes sharply to late transformer layers (25–36 in 3B models), emerges abruptly across checkpoints, and is only partially reversible via activation patching (up to 56.8%), with strongly corrupted beliefs becoming increasingly resistant to localized rescue. These corrupted beliefs generalize across prompt formats and languages, selectively degrade commonsense reasoning (large negative Cohen's d on HellaSwag), and leave alignment benchmarks largely unaffected—exposing a failure mode in which targeted misinformation evades standard performance-based detection while rewriting internal factual representations.",
    412   "red_flags": [
    413     {
    414       "flag": "Test set equals training set",
    415       "detail": "The primary belief flip evaluation is performed on the same 52 entities used for CPT poisoning—there is no held-out factual test set; generalization is tested only via surface-form variation and OOD benchmarks."
    416     },
    417     {
    418       "flag": "52-entity CPT corpus too small",
    419       "detail": "Only 52 unique entities (from 212) are used for CPT due to 'computational constraints,' justified without power analysis. Whether findings generalize to more diverse or subtle misinformation is untested."
    420     },
    421     {
    422       "flag": "Model versions unspecified",
    423       "detail": "Qwen2.5 models are described as 'available on Hugging Face' without checkpoint IDs or snapshot dates, making exact reproduction impossible."
    424     },
    425     {
    426       "flag": "No code or dataset release",
    427       "detail": "Neither the training/evaluation code nor the custom dataset D (147,884 instances) is released, making independent replication infeasible."
    428     },
    429     {
    430       "flag": "Paradoxical logic improvement unexplained",
    431       "detail": "Formal logic (BBEH Logic) improves in 7B but degrades in 3B models under poisoning—described as 'paradoxical' with no mechanistic explanation proposed, raising questions about interpretive completeness."
    432     },
    433     {
    434       "flag": "GPU and compute budget unspecified",
    435       "detail": "Only 'single GPU using bfloat16' is stated; no GPU model, CUDA version, or compute hours are reported, preventing meaningful cost assessment or reproducibility."
    436     },
    437     {
    438       "flag": "GPT-5 used for counterfactual generation",
    439       "detail": "Counterfactuals were 'generated using GPT-5 and manually validated'—GPT-5 is a non-public or recently released model at submission time, making dataset recreation dependent on access to that system."
    440     }
    441   ],
    442   "cited_papers": [
    443     {
    444       "title": "Locating and Editing Factual Associations in GPT",
    445       "relevance": "Core prior work on localizing factual knowledge to specific transformer layers/heads; this paper extends those methods longitudinally across CPT checkpoints."
    446     },
    447     {
    448       "title": "Discovering Latent Knowledge in Language Models Without Supervision",
    449       "relevance": "Prior work on probing internal representations for factual beliefs without relying on behavioral outputs; directly motivates the log-likelihood preference measurement approach."
    450     },
    451     {
    452       "title": "Poisoning Web-Scale Training Datasets is Practical",
    453       "relevance": "Prior poisoning work framing corruption as an external threat in static pre-training; this paper distinguishes its repeated-exposure CPT regime from that paradigm."
    454     },
    455     {
    456       "title": "Sleeper Agents: Training Deceptive LLMs That Persist Through Safety Training",
    457       "relevance": "Shows that certain induced behaviors survive alignment training; complements findings that poisoned factual beliefs survive alignment benchmarks."
    458     },
    459     {
    460       "title": "Persistent Pre-training Poisoning of LLMs",
    461       "relevance": "Most closely related work on poisoning persistence across training phases; this paper extends to the continual/longitudinal setting."
    462     },
    463     {
    464       "title": "TruthfulQA: Measuring How Models Mimic Human Falsehoods",
    465       "relevance": "Key benchmark used for evaluating alignment stability under poisoning; also frames the factual reliability problem this paper addresses."
    466     },
    467     {
    468       "title": "Knowledge Does Not Protect Against Illusory Truth",
    469       "relevance": "Cognitive psychology source for the illusory truth effect analogy—repeated exposure to falsehood increases perceived truth—which motivates the experimental design."
    470     },
    471     {
    472       "title": "Don't Stop Pretraining: Adapt Language Models to Domains and Tasks",
    473       "relevance": "Foundational continual pre-training work establishing the regime this paper studies as a vector for misinformation propagation."
    474     }
    475   ],
    476   "engagement_factors": {
    477     "practical_relevance": {
    478       "score": 2,
    479       "justification": "Organizations doing domain adaptation or model refreshing with CPT face a concrete, detection-evading attack surface; representation-level monitoring is actionable but requires specialized tooling."
    480     },
    481     "surprise_contrarian": {
    482       "score": 3,
    483       "justification": "The finding that belief flips occur without alignment benchmark degradation directly contradicts the assumption that standard evals catch knowledge corruption, and the abrupt step-like dynamics contradict smooth gradient expectations."
    484     },
    485     "fear_safety": {
    486       "score": 3,
    487       "justification": "Demonstrates a stealthy attack mode where targeted misinformation rewrites factual knowledge while evading safety evaluations—directly relevant to model supply chain integrity and deployment-time monitoring."
    488     },
    489     "drama_conflict": {
    490       "score": 1,
    491       "justification": "No direct controversy or competing camps; the work fits within the established mechanistic interpretability and poisoning literature without antagonizing a specific community."
    492     },
    493     "demo_ability": {
    494       "score": 1,
    495       "justification": "Requires CPT infrastructure, custom dataset construction, and logit-lens tooling; no interactive demo or easily runnable artifact is provided."
    496     },
    497     "brand_recognition": {
    498       "score": 1,
    499       "justification": "Authors are from NUS (respectable but not a headline AI lab); the Qwen2.5 models used are commercially recognized but lack GPT-4/Claude-level brand draw."
    500     }
    501   },
    502   "hn_data": {
    503     "threads": [],
    504     "top_points": 0,
    505     "total_points": 0,
    506     "total_comments": 0
    507   }
    508 }

Impressum · Datenschutz