scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (30645B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "How Does Controllability Emerge In Language Models During Pretraining?",
      6     "authors": [
      7       "Jianshu She",
      8       "Xinyue Li",
      9       "Eric Xing",
     10       "Zhengzhong Liu",
     11       "Qirong Ho"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2508.01892",
     16     "doi": "10.48550/arXiv.2508.01892"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims steerability emerges at intermediate stages and that related concepts emerge at distinct stages — both are supported by Figure 1 (CrystalCoder emotion experiments) and Figure 2 (commonsense reasoning tasks).",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper frames steerability as 'emerging during training' causally, but the design is purely observational. The authors themselves note in Section 4.2 that 'earlier improvements may stem from pretraining itself rather than the steering effect,' acknowledging the confound is unresolved.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title and conclusion claim findings about 'Language Models' broadly, but experiments cover only two 7B-scale models (CrystalCoder and Amber). The limitations section acknowledges different model sizes were not tested, making the broad framing an overgeneralization.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section 4.2 explicitly acknowledges that early accuracy improvements may result from pretraining progress rather than steering, and control groups with different learning rates are used to rule out fine-tuning annealing effects.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Emotion intensity is evaluated via ChatGPT-4 scoring used as a direct proxy for actual emotional expression. Limitation 6 acknowledges LLM-as-judge introduces 'subjectivity and ambiguity,' but main results conflate ChatGPT scores with ground-truth steerability.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 6 'Limitation' contains 7 numbered specific limitations including model scale constraints, empirically tuned coefficients, exclusion of nonlinear methods, LLM-as-judge subjectivity, and safety concerns with offensive outputs.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats are named: only 7B models tested, intervention coefficients vary significantly between models (~40 for Crystal vs ~3 for Amber) without principled explanation, emotion concepts lack ground-truth making steerability onset judgments subjective.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly states it does not generalize to larger model sizes, does not explore nonlinear alternatives to linear steering, and that stimulus quality assumptions limit generalizability of emotion results.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No acknowledgment section or funding disclosure appears anywhere in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly stated on the title page: Mohamed bin Zayed University of Artificial Intelligence (Abu Dhabi, UAE) and Carnegie Mellon University.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding is disclosed so independence cannot be assessed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or financial disclosure appears anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are explicitly defined: 'linear steerability' is defined as 'the ability to adjust output via linear transformations of hidden states,' and the Intervention Detector (ID) is introduced and formalized with mathematical notation in Section 3 and Table 9.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 1 lists four explicit contributions: demonstrating steerability emerges in later training stages, showing concept-specific emergence timing, linking emergence to linear separability, and introducing the ID framework.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Table 1 provides a structured comparison of five related activation intervention methods (RepE, CAA, ITI, CCS, ActAdd). The paper explicitly positions itself as the first longitudinal study of steerability across the training lifecycle, contrasting with prior work restricted to fully trained models.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No code repository is linked or mentioned in the paper. The Intervention Detector framework code is not publicly available.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The ChatGPT-generated emotion stimulus sets (1500 scenarios) central to the unsupervised task are not released. Standard benchmarks (ARC, OBQA, RACE) are public but the novel emotion dataset is the core experimental contribution.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Fine-tuning hyperparameters are listed in Table 5 but no environment specification (requirements.txt, Docker, CUDA version, GPU hardware) is provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Methodology is described in Section 3 but no step-by-step reproduction instructions exist. Code is absent, and stimulus construction via ChatGPT-4 would require additional undocumented effort to replicate.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Main results in Figures 1 and 2 do not show error bars or confidence intervals. Figures 6 and 11 show multiple seeds as supplementary material but core quantitative claims lack uncertainty estimates.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied anywhere in the paper. All conclusions are drawn from visual inspection of trend curves and heatmaps.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "No formal effect sizes are reported. Accuracy changes are presented as raw percentages in figures without baseline-contextualized effect size calculations or confidence intervals.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The use of 256 stimulus pairs per experiment is not justified with power analysis or sample size rationale; this number is stated without explanation.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Variance across runs is only partially shown in supplementary figures (Figure 6 uses 3 seeds, Figure 11 shows mean across seeds) but main quantitative results in Table 10 and Figure 2 do not report variance or standard deviation.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Figure 2 compares intervened model accuracy against a no-intervention baseline across all checkpoints and all four reasoning datasets.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "The no-intervention baseline is appropriate for the research question (whether intervention improves performance at each checkpoint stage rather than comparison against other methods).",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Appendix E.2 ablates number of injected layers and scaling factor; Figure 12 ablates token position selection; Appendix L compares PCA vs K-Means; Table 6 ablates base vs fine-tuned model; Appendix M tests different learning rates.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Multiple metrics are used: ID score heatmaps, cosine similarity of representation vectors, entropy trends, layer-wise ID score differences, dataset accuracy, and ChatGPT-assigned emotion intensity scores.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "Emotion intensity is evaluated entirely by ChatGPT-4, not human annotators. The paper uses LLM-as-judge rather than human evaluation for the unsupervised emotion task.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "The stimulus set is split into training (Strain) for computing concept representations and test (Stest) for computing ID scores, as described in Section 3.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down per emotion (6 emotions in Figures 1b and 4) and per dataset (4 reasoning datasets in Figures 2 and 7).",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Early-stage interventions reducing accuracy below baseline are shown in Figure 2, surprise/disgust show weaker or inconclusive steerability, and Appendix E.2 shows high scaling factors producing incoherent or offensive outputs.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper reports that early-stage interventions reduce accuracy below baseline, happiness steering at lower coefficients fails to produce emotional content, and some emotions remain inconclusive throughout training.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "LLM360/CrystalCoder and Amber models are specified with full architecture details in Tables 2 and 8 respectively, and checkpoints are described as saved every 15K steps.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Stimulus templates with exact format are provided in Appendix A and B, including positive/negative prompt templates with concrete examples, and the full ChatGPT evaluation prompt is shown with filled values.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Table 5 reports fine-tuning parameters (iterations, sequence length, batch size, learning rate, BF16 precision). Intervention parameters (scaling factor 40, top 10 layers for all experiments) are specified in Section 3.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used; this paper studies direct activation manipulation during standard LLM inference.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Appendix A describes stimulus construction (ChatGPT-4 generates 1500 scenarios, 256 randomly selected per run, split into train/test). Appendix B describes supervised task stimulus construction using correct/incorrect answer pairs from benchmark datasets.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Neither the emotion stimulus sets nor the extracted hidden states are publicly available for independent verification.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": false,
    276           "justification": "The generation process is described (ChatGPT-4 prompted to generate scenarios) but the exact prompts used to generate the 1500 emotion scenarios from ChatGPT-4 are not provided.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; standard benchmark datasets and LLM-generated stimuli are used.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Section 3 provides a detailed 4-step pipeline: hidden state collection, linear decomposition (PCA/K-means), ID score calculation, and intervention. Figure 3 provides a visual overview of all steps.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "ARC Challenge/Easy, OBQA, and RACE are used as evaluation benchmarks but the paper does not state CrystalCoder's training data cutoff or whether these benchmarks appeared in the pretraining corpus.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Potential overlap between ARC/OBQA/RACE and CrystalCoder's pretraining data is never discussed, a significant omission since the study tracks accuracy across pretraining checkpoints where contamination would confound interpretation.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "No discussion of benchmark contamination despite the fact that accuracy changes on ARC/OBQA/RACE during pretraining could reflect memorization of contaminated training data rather than emergent reasoning.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference cost, latency, or throughput figures are reported for running the Intervention Detector framework across checkpoints.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No compute budget (GPU hours, hardware specifications, or total cost) is reported despite fine-tuning all pretraining checkpoints multiple times.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Linear steerability emerges abruptly at a specific intermediate training stage (~68% of pretraining steps for anger) rather than gradually throughout training",
    375       "evidence": "Figure 1a shows a sharp increase in ChatGPT-scored emotion intensity for anger interventions only after ~68% of training steps on CrystalCoder, with no notable effect prior to that checkpoint",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Closely related emotional concepts (e.g., anger vs. sadness) become steerable at distinct training stages",
    380       "evidence": "Figure 1b shows anger and fear emerge at earlier checkpoints while sadness, surprise, and disgust emerge later with weaker control outcomes; supported visually but without statistical testing",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "A model can represent and express a concept before it becomes linearly steerable, indicating steerability is a distinct capability",
    385       "evidence": "Figure 1a caption notes 'the model demonstrates the ability to express anger earlier than it develops linear steerability over it,' but the methodology for measuring expression ability is less rigorously defined than for measuring steerability",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "Linear separability in hidden state space is the structural precondition for effective linear steering, and concepts become increasingly linearly separable as training progresses",
    390       "evidence": "ID score heatmaps (Figure 4) show high-layer ID scores above 0.8 emerging at the same checkpoint where steerability appears; cosine similarity drops coincide with emergence events (Figure 5); PCA first component dominance increases over training (Figure 10)",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "The Intervention Detector framework generalizes across different model families (Crystal and Amber)",
    395       "evidence": "Figure 13 shows similar heatmap patterns for Amber as for CrystalCoder; Table 10 shows ID score spike checkpoints align with effective intervention checkpoints across 4 datasets, but only two 7B-scale models are tested",
    396       "supported": "weak"
    397     },
    398     {
    399       "claim": "Layer-wise ID score differences (spikes) predict the checkpoint at which steerability emergence occurs",
    400       "evidence": "Table 10 shows alignment between 'biggest spike' checkpoint and 'effective intervention' checkpoint across RACE (93%/90%), OBQA (63%/65%), ARC-C (99%/99%), ARC-E (100%/98%)",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "observational",
    406     "benchmark-eval"
    407   ],
    408   "key_findings": "Linear steerability in language models does not emerge uniformly during pretraining but appears abruptly at specific training stages that vary by concept — anger and fear emerge earlier than sadness, surprise, or disgust. The Intervention Detector (ID) framework tracks this emergence through linear separability metrics on hidden state representations and can roughly predict when steering interventions will become effective. A key counter-intuitive finding is that a model can express a concept via prompting before that concept becomes linearly steerable, indicating steerability is a distinct structural capability requiring not just concept representation but linear separability. These patterns replicate across two 7B-scale open-source models (CrystalCoder and Amber) but have not been tested at other scales.",
    409   "red_flags": [
    410     {
    411       "flag": "LLM-as-judge evaluation",
    412       "detail": "Emotion intensity in the unsupervised task is evaluated entirely by ChatGPT-4, introducing circular reasoning and subjectivity. No human validation of emotion scores was conducted, and the ChatGPT scoring prompt anchors on the final CrystalChat model output as the '10' reference, which introduces additional bias."
    413     },
    414     {
    415       "flag": "No statistical testing",
    416       "detail": "All conclusions are drawn from visual inspection of heatmaps and trend curves. No significance tests, confidence intervals, or formal effect sizes are reported for any main claim."
    417     },
    418     {
    419       "flag": "Benchmark contamination unaddressed",
    420       "detail": "ARC, OBQA, and RACE benchmarks are used to measure reasoning steerability during pretraining, but the paper does not discuss whether these appeared in CrystalCoder's pretraining corpus. Contamination would directly confound interpretation of accuracy changes across checkpoints."
    421     },
    422     {
    423       "flag": "Overgeneralized title and scope claims",
    424       "detail": "The paper title and conclusion claim findings about 'Language Models' broadly and 'general patterns,' but experiments cover only two 7B-scale models from the same open-source ecosystem (LLM360 Crystal and Amber) with similar architectures."
    425     },
    426     {
    427       "flag": "Empirically tuned hyperparameters",
    428       "detail": "Scaling factor for interventions varies dramatically between models (~40 for Crystal, ~3 for Amber) and was tuned through trial-and-error, which could bias comparisons of steerability timing across models and makes results hard to replicate."
    429     },
    430     {
    431       "flag": "No code or custom data release",
    432       "detail": "Neither the ID framework code nor the ChatGPT-generated emotion stimulus datasets are released, preventing reproduction of the core experimental setup despite the paper framing ID as a broadly applicable 'cost-effective monitoring tool.'"
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "Representation Engineering: A Top-Down Approach to AI Transparency",
    438       "relevance": "Direct predecessor; RepE is one of the activation engineering methods adapted into the Intervention Detector framework"
    439     },
    440     {
    441       "title": "Steering Llama 2 via Contrastive Activation Addition (CAA)",
    442       "relevance": "Key related method; paper borrows CAA's contrastive stimulus approach and directly compares token-level vs sentence-level stimuli in Appendix O"
    443     },
    444     {
    445       "title": "Inference-Time Intervention: Eliciting Truthful Answers from a Language Model",
    446       "relevance": "Component method integrated into ID framework; uses sentence-level stimuli for truthfulness steering"
    447     },
    448     {
    449       "title": "Discovering Latent Knowledge in Language Models Without Supervision (CCS)",
    450       "relevance": "Unsupervised contrastive linear probing method incorporated into the ID framework comparison"
    451     },
    452     {
    453       "title": "LLM360: Towards Fully Transparent Open-Source LLMs",
    454       "relevance": "Source of the CrystalCoder and Amber pretraining checkpoints that enable the entire longitudinal study"
    455     },
    456     {
    457       "title": "Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling",
    458       "relevance": "One of the few models providing open intermediate checkpoints; motivates the study's checkpoint-based analysis approach"
    459     },
    460     {
    461       "title": "Emergent Abilities of Large Language Models",
    462       "relevance": "Conceptual framing for emergence; paper positions steerability emergence as analogous to emergent abilities in Wei et al."
    463     },
    464     {
    465       "title": "Are Emergent Abilities of Large Language Models a Mirage?",
    466       "relevance": "Cited in context of emergence debate; relevant to interpreting whether steerability emergence is a real phenomenon or a measurement artifact"
    467     },
    468     {
    469       "title": "Scaling Monosemanticity: Extracting Interpretable Features from Claude 3 Sonnet",
    470       "relevance": "Sparse autoencoder approach for concept extraction contrasted with the paper's training-free linear approach"
    471     },
    472     {
    473       "title": "Towards Tracing Trustworthiness Dynamics: Revisiting Pre-training Period of Large Language Models",
    474       "relevance": "Direct competitor also studying model properties across pretraining checkpoints; cited as related longitudinal analysis"
    475     }
    476   ],
    477   "engagement_factors": {
    478     "practical_relevance": {
    479       "score": 2,
    480       "justification": "Identifies when during training activation steering becomes effective, which could guide training curriculum design and early stopping decisions for controllability applications."
    481     },
    482     "surprise_contrarian": {
    483       "score": 2,
    484       "justification": "Counter-intuitive finding that a model can express a concept via prompting before it can be linearly steered challenges the assumption that concept representation implies steerable control."
    485     },
    486     "fear_safety": {
    487       "score": 1,
    488       "justification": "Has indirect safety relevance for AI alignment and controllability but does not raise acute safety concerns; paper notes it generates offensive outputs as a side effect without mitigation."
    489     },
    490     "drama_conflict": {
    491       "score": 1,
    492       "justification": "Minor engagement with the emergent abilities debate (Wei et al. vs Schaeffer et al.) but no direct controversy with other research groups."
    493     },
    494     "demo_ability": {
    495       "score": 1,
    496       "justification": "Requires access to open LLM360 checkpoints and unreleased code; not easily demonstrable by practitioners without significant infrastructure setup."
    497     },
    498     "brand_recognition": {
    499       "score": 1,
    500       "justification": "Mohamed bin Zayed University of AI and CMU are respectable institutions but the paper lacks affiliation with top-tier LLM labs (OpenAI, DeepMind, Google, Meta)."
    501     }
    502   },
    503   "hn_data": {
    504     "threads": [
    505       {
    506         "hn_id": "36979886",
    507         "title": "Observation of zero resistance above 100 K in Pb₁₀₋ₓCuₓ(PO₄)₆O",
    508         "points": 779,
    509         "comments": 372,
    510         "url": "https://news.ycombinator.com/item?id=36979886",
    511         "created_at": "2023-08-03T00:43:14Z"
    512       },
    513       {
    514         "hn_id": "45553577",
    515         "title": "Meta Superintelligence Labs' first paper is about RAG",
    516         "points": 423,
    517         "comments": 271,
    518         "url": "https://news.ycombinator.com/item?id=45553577",
    519         "created_at": "2025-10-11T23:16:05Z"
    520       },
    521       {
    522         "hn_id": "44502527",
    523         "title": "Dynamical origin of Theia, the last giant impactor on Earth",
    524         "points": 96,
    525         "comments": 46,
    526         "url": "https://news.ycombinator.com/item?id=44502527",
    527         "created_at": "2025-07-08T18:10:46Z"
    528       },
    529       {
    530         "hn_id": "43374283",
    531         "title": "AutoHete: An Automatic and Efficient Heterogeneous Training System for LLMs",
    532         "points": 44,
    533         "comments": 0,
    534         "url": "https://news.ycombinator.com/item?id=43374283",
    535         "created_at": "2025-03-15T18:22:37Z"
    536       },
    537       {
    538         "hn_id": "41262642",
    539         "title": "AI and the value of privacy-preserving tools to distinguish who is real online",
    540         "points": 5,
    541         "comments": 1,
    542         "url": "https://news.ycombinator.com/item?id=41262642",
    543         "created_at": "2024-08-16T02:37:29Z"
    544       },
    545       {
    546         "hn_id": "45213397",
    547         "title": "Refrag: Rethinking RAG Based Decoding",
    548         "points": 4,
    549         "comments": 1,
    550         "url": "https://news.ycombinator.com/item?id=45213397",
    551         "created_at": "2025-09-11T16:22:46Z"
    552       },
    553       {
    554         "hn_id": "45164490",
    555         "title": "Refrag: Rethinking RAG Based Decoding",
    556         "points": 4,
    557         "comments": 0,
    558         "url": "https://news.ycombinator.com/item?id=45164490",
    559         "created_at": "2025-09-08T03:53:27Z"
    560       },
    561       {
    562         "hn_id": "42871314",
    563         "title": "Is Your Image a Good Storyteller?",
    564         "points": 4,
    565         "comments": 0,
    566         "url": "https://news.ycombinator.com/item?id=42871314",
    567         "created_at": "2025-01-29T21:20:31Z"
    568       },
    569       {
    570         "hn_id": "41274485",
    571         "title": "AI and the value of privacy-preserving tools to distinguish who is real online",
    572         "points": 4,
    573         "comments": 0,
    574         "url": "https://news.ycombinator.com/item?id=41274485",
    575         "created_at": "2024-08-17T13:35:48Z"
    576       },
    577       {
    578         "hn_id": "47286294",
    579         "title": "MLP Memory: A Retriever-Pretrained Memory for Large Language Models",
    580         "points": 1,
    581         "comments": 0,
    582         "url": "https://news.ycombinator.com/item?id=47286294",
    583         "created_at": "2026-03-07T10:26:16Z"
    584       }
    585     ],
    586     "top_points": 779,
    587     "total_points": 1364,
    588     "total_comments": 691
    589   }
    590 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs