scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23916B)
      1 {
      2   "paper": {
      3     "title": "The Lock-In Phase Hypothesis: Identity Consolidation as a Precursor to AGI",
      4     "authors": ["Marcelo Maciel Amaral", "Raymond Aschheim"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2510.20190",
      8     "doi": "10.48550/arXiv.2510.20190"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["theoretical", "benchmark-eval"],
     13   "key_findings": "The paper proposes the 'Lock-In Phase Hypothesis' — that progress toward AGI involves a consolidation phase where model identity becomes stable and resistant to steering. Experiments on 4 models (Gemma-2B, Llama-1B/3B/8B) fine-tuned with a 'Cautious Scientist' persona show rapid, non-linear behavioral consolidation (measured via Refusal Elasticity), with capability side-effects varying by model scale: small models pay a performance tax, mid-scale models absorb consolidation cost-free, and large quantized models show transient instabilities.",
     14   "claims": [
     15     {
     16       "claim": "Identity consolidation is rapid and non-linear, resembling a phase transition rather than smooth drift.",
     17       "evidence": "Figure 1 and Table 1 show RE jumping from ~47% to ~64% within ≤20 steps for Gemma-2B, and from ~17% to >80% for Llama-3B. Section 6.1.",
     18       "supported": "moderate"
     19     },
     20     {
     21       "claim": "Capability side-effects of consolidation depend on model capacity and numerical precision.",
     22       "evidence": "Table 1: Gemma-2B shows ΔARC = -0.33pp (negligible), Llama-1B volatile, Llama-3B +4.01pp uplift, Llama-8B ΔARC = 0.00. Section 6.1 overview of findings.",
     23       "supported": "moderate"
     24     },
     25     {
     26       "claim": "Small models pay a performance cost for consolidation while mid-scale models absorb it.",
     27       "evidence": "For Llama-1B, volatile critical period with co-moving ARC and RE. For Gemma-2B, ARC remains flat (SD ≈0.60pp). Section 6.1.",
     28       "supported": "weak"
     29     },
     30     {
     31       "claim": "Large quantized models show transient instabilities during consolidation.",
     32       "evidence": "Llama-8B 4-bit: ARC spikes +12pp, dips, then recovers. Only 5 checkpoints. Section 6.1.",
     33       "supported": "weak"
     34     }
     35   ],
     36   "checklist": {
     37     "artifacts": {
     38       "code_released": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "GitHub repository URL provided: https://github.com/gaugefreedom/persona-phase-transition. Referenced in Section 6.1 and footnote 1."
     42       },
     43       "data_released": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper states 'full per-checkpoint artifacts are available' at the GitHub repository. Uses public ARC-Challenge benchmark."
     47       },
     48       "environment_specified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No environment specifications, requirements.txt, or dependency details mentioned in the paper. Only mentions '4-bit weight quantization' for 8B model and 'commodity hardware'."
     52       },
     53       "reproduction_instructions": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No step-by-step reproduction instructions in the paper. References GitHub repo but paper itself contains no reproduction guide."
     57       }
     58     },
     59     "statistical_methodology": {
     60       "confidence_intervals_or_error_bars": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No confidence intervals or error bars reported. Table 1 reports point estimates for mean ARC and correlations without uncertainty bounds."
     64       },
     65       "significance_tests": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No formal significance tests support the main comparative claims (small vs mid-scale models). Spearman correlations are reported with p-values (e.g., 'p < 10⁻³'), and a 'pre/post nonparametric test' for Gemma ARC is mentioned but deferred to the repository."
     69       },
     70       "effect_sizes_reported": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Effect sizes reported in context: ΔARC values in pp (e.g., -0.33pp, +4.01pp), RE changes (47%→64%), and Spearman ρ values in Table 1."
     74       },
     75       "sample_size_justified": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No justification for why 4 models were chosen, why checkpoint counts vary (5-19), or power analysis. The 8B model has only 5 checkpoints."
     79       },
     80       "variance_reported": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "SD reported only for Gemma-2B ARC (≈0.60pp). No variance across runs reported — appears to be single-run experiments for each model."
     84       }
     85     },
     86     "evaluation_design": {
     87       "baselines_included": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No baseline consolidation methods compared. The paper tracks metrics over fine-tuning checkpoints but does not compare against alternative consolidation approaches or control conditions."
     91       },
     92       "baselines_contemporary": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No baselines included at all."
     96       },
     97       "ablation_study": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No ablation study. The paper varies model scale but does not ablate components of the consolidation procedure (e.g., dataset size, learning rate, persona direction construction)."
    101       },
    102       "multiple_metrics": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Three metrics tracked: Refusal Elasticity (RE), persona similarity cosine, and ARC-Challenge accuracy. Table 1 and Figure 1."
    106       },
    107       "human_evaluation": {
    108         "applies": false,
    109         "answer": false,
    110         "justification": "Human evaluation is not relevant to measuring internal representation dynamics and automated behavioral metrics."
    111       },
    112       "held_out_test_set": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "No discussion of held-out test sets. ARC-Challenge is used as-is. The steering prompt suite and persona dataset construction are not described with train/test splits."
    116       },
    117       "per_category_breakdown": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Results broken down per model (4 models) with distinct dynamics described for each. Table 1 and Figure 1 panels a-d."
    121       },
    122       "failure_cases_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Llama-1B described as 'volatile critical period' with collapse and partial recovery. Llama-8B 4-bit shows 'transient instabilities'. Footnote 1 discusses a failed ARC evaluation checkpoint."
    126       },
    127       "negative_results_reported": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Reports that small models (Llama-1B) show volatile, unstable consolidation and that quantized 8B models exhibit transient capability instabilities — these are negative findings about consolidation dynamics."
    131       }
    132     },
    133     "claims_and_evidence": {
    134       "abstract_claims_supported": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Abstract claims about 'spectrum of outcomes' across model scales are supported by Table 1 and Figure 1. Claims are appropriately hedged ('our results reveal')."
    138       },
    139       "causal_claims_justified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper makes causal claims ('identity lock-in is rapid', 'its capability side-effects depend on scale and numerical precision') but the design doesn't control for confounds — scale and quantization co-vary for the 8B model, and each model is a single run."
    143       },
    144       "generalization_bounded": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Title claims relevance to 'AGI' but experiments cover only 4 small instruction-tuned models (1B-8B) with a single persona. The paper frames findings as general 'identity consolidation dynamics' far beyond what 4 models can support."
    148       },
    149       "alternative_explanations_discussed": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 9 (Limitations) discusses confounds: quantization stress for 8B, metric artifacts, optimizer/data/architecture dependence. Section 6.1 notes Llama-3B disclaimer rate may inflate RE."
    153       },
    154       "proxy_outcome_distinction": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "RE (Refusal Elasticity) and ARC-Challenge accuracy are used as proxies for 'identity consolidation' and 'general reasoning' respectively, but the gap between these proxies and the claimed constructs is not discussed. ARC is acknowledged as limited in Section 9 but not formally distinguished as a proxy."
    158       }
    159     },
    160     "setup_transparency": {
    161       "model_versions_specified": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific model names with sizes given: Gemma-2-2B-IT, Llama-3.2-1B-Instruct, Llama-3.2-3B-Instruct, Llama-3.1-8B-Instruct. Section 6.1."
    165       },
    166       "prompts_provided": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The 'standardized steering prompts' and 'attack prompts' used for RE measurement are referenced but not provided in the paper. The 'small persona dataset' content is not shown."
    170       },
    171       "hyperparameters_reported": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No hyperparameters reported — no learning rate, batch size, optimizer, number of training steps, or fine-tuning configuration details beyond '4-bit weight quantization' for 8B."
    175       },
    176       "scaffolding_described": {
    177         "applies": false,
    178         "answer": false,
    179         "justification": "No agentic scaffolding used. This is a fine-tuning experiment."
    180       },
    181       "data_preprocessing_documented": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The persona dataset construction is described only as 'following Chen et al. (2025)' — no details on dataset size, composition, or preprocessing. The contrastive text pairs for persona direction are not described."
    185       }
    186     },
    187     "limitations_and_scope": {
    188       "limitations_section_present": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 9 'Limitations' provides substantive discussion of functional vs. metaphysical lock-in, domain specificity, resource constraints, and metric limitations."
    192       },
    193       "threats_to_validity_specific": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 9 discusses specific threats: checkpoint granularity, evaluation noise (one failed ARC run), reliance on ARC as single reasoning proxy, small-n 8B run with quantization stress, interpretability assumptions for proposed metrics."
    197       },
    198       "scope_boundaries_stated": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 9 explicitly states: MoE metrics may not transfer to dense models, consolidation may be domain-specific rather than global, 4-bit quantization 'may not reflect full-precision behavior'. Future work needs are listed."
    202       }
    203     },
    204     "data_integrity": {
    205       "raw_data_available": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Paper states 'full per-checkpoint artifacts are available' at the GitHub repository. However, this cannot be verified from the paper alone."
    209       },
    210       "data_collection_described": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "The persona dataset and steering prompt suite used for experiments are not described in detail. Only references to Chen et al. (2025) methodology."
    214       },
    215       "recruitment_methods_described": {
    216         "applies": false,
    217         "answer": false,
    218         "justification": "No human participants. Data consists of model fine-tuning with synthetic persona datasets and standard benchmarks."
    219       },
    220       "data_pipeline_documented": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "The pipeline from persona direction construction → fine-tuning → checkpoint evaluation is described at a high level but without sufficient detail to reproduce. Dataset sizes, checkpoint intervals, evaluation procedures not fully specified."
    224       }
    225     },
    226     "conflicts_of_interest": {
    227       "funding_disclosed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Funding section states: 'No external funding. Work conducted independently at Gauge Freedom, Inc.'"
    231       },
    232       "affiliations_disclosed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Authors identified as affiliated with 'Gauge Freedom, Inc. (Public Benefit Corporation), Los Angeles, CA, USA.'"
    236       },
    237       "funder_independent_of_outcome": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Gauge Freedom, Inc. is an AI company that may have commercial interest in AGI safety narratives. The paper does not discuss whether the company has a stake in the outcome. Self-funded work at a company is not 'independent.'"
    241       },
    242       "financial_interests_declared": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No competing interests statement. Authors work at a company ('Gauge Freedom, Inc.') that may have financial interests related to AGI/AI safety."
    246       }
    247     },
    248     "contamination": {
    249       "training_cutoff_stated": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No training data cutoff dates stated for any of the 4 models used. ARC-Challenge could be in their training data."
    253       },
    254       "train_test_overlap_discussed": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "No discussion of whether ARC-Challenge examples appeared in the training data of Gemma or Llama models."
    258       },
    259       "benchmark_contamination_addressed": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "ARC-Challenge (2018) predates all models used and is widely available online. No contamination analysis or discussion."
    263       }
    264     },
    265     "human_studies": {
    266       "pre_registered": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "irb_or_ethics_approval": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       },
    276       "demographics_reported": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants."
    280       },
    281       "inclusion_exclusion_criteria": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants."
    285       },
    286       "randomization_described": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants."
    290       },
    291       "blinding_described": {
    292         "applies": false,
    293         "answer": false,
    294         "justification": "No human participants."
    295       },
    296       "attrition_reported": {
    297         "applies": false,
    298         "answer": false,
    299         "justification": "No human participants."
    300       }
    301     },
    302     "cost_and_practicality": {
    303       "inference_cost_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No inference cost or training cost reported. Only mentions 'commodity hardware' and '4-bit quantization' without quantifying compute."
    307       },
    308       "compute_budget_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No GPU hours, training time, or total compute budget stated despite running fine-tuning experiments on 4 models."
    312       }
    313     },
    314     "experimental_rigor": {
    315       "seed_sensitivity_reported": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "No mention of multiple random seeds. Appears to be single-run experiments for each model."
    319       },
    320       "number_of_runs_stated": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "Number of runs not stated. Results appear to be from single runs per model."
    324       },
    325       "hyperparameter_search_budget": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "No hyperparameter search budget reported. Fine-tuning hyperparameters not even stated."
    329       },
    330       "best_config_selection_justified": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "No discussion of configuration selection. Only one configuration appears to have been run per model."
    334       },
    335       "multiple_comparison_correction": {
    336         "applies": false,
    337         "answer": false,
    338         "justification": "Only a handful of correlations reported; correction not necessary for this number of comparisons."
    339       },
    340       "self_comparison_bias_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "Paper does not compare against baselines or re-implement others' methods, so self-comparison bias is not applicable."
    344       },
    345       "compute_budget_vs_performance": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of compute costs relative to performance. The 8B model uses quantization (reducing compute) but performance implications are discussed only qualitatively."
    349       },
    350       "benchmark_construct_validity": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "ARC-Challenge is used as a proxy for 'general reasoning' without discussing whether it actually measures what is claimed. Section 9 briefly acknowledges 'reliance on ARC as a proxy for broad reasoning' but does not discuss construct validity."
    354       },
    355       "scaffold_confound_addressed": {
    356         "applies": false,
    357         "answer": false,
    358         "justification": "No scaffolding involved — direct model evaluation."
    359       }
    360     },
    361     "data_leakage": {
    362       "temporal_leakage_addressed": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "ARC-Challenge (2018) predates all models. No discussion of whether models were trained on ARC solutions."
    366       },
    367       "feature_leakage_addressed": {
    368         "applies": true,
    369         "answer": false,
    370         "justification": "No discussion of whether evaluation setup leaks information."
    371       },
    372       "non_independence_addressed": {
    373         "applies": true,
    374         "answer": false,
    375         "justification": "No discussion of independence between persona training data and evaluation data."
    376       },
    377       "leakage_detection_method": {
    378         "applies": true,
    379         "answer": false,
    380         "justification": "No leakage detection or prevention methods applied."
    381       }
    382     }
    383   },
    384   "red_flags": [
    385     {
    386       "flag": "Claims far exceed evidence",
    387       "detail": "The paper claims relevance to AGI from experiments on 4 small models (1B-8B) with a single persona and a single reasoning benchmark (ARC-Challenge). The theoretical framework is much grander than what the experiments support."
    388     },
    389     {
    390       "flag": "Single-run experiments",
    391       "detail": "All results appear to be from single runs with no seed variation. The Llama-8B model has only 5 checkpoints. No statistical tests comparing models."
    392     },
    393     {
    394       "flag": "Missing experimental details",
    395       "detail": "No hyperparameters, learning rates, dataset sizes, or training configurations reported. The persona dataset and steering prompts are not provided. Reproducibility depends entirely on the GitHub repo."
    396     },
    397     {
    398       "flag": "Quantization confound",
    399       "detail": "The 8B model uniquely uses 4-bit quantization, making it impossible to separate capacity effects from quantization effects. The paper acknowledges this but still draws conclusions."
    400     },
    401     {
    402       "flag": "No contamination analysis",
    403       "detail": "ARC-Challenge (2018) is a widely available benchmark likely in the training data of all models used. Absolute ARC scores are meaningless without contamination analysis; only relative changes across fine-tuning steps are interpretable."
    404     }
    405   ],
    406   "cited_papers": [
    407     {
    408       "title": "Finetuned Language Models Are Zero-Shot Learners",
    409       "authors": ["J. Wei", "M. Bosma", "V. Y. Zhao"],
    410       "year": 2021,
    411       "arxiv_id": "2109.01652",
    412       "relevance": "Foundational work on instruction tuning improving zero-shot generalization, directly relevant to LLM capability evaluation."
    413     },
    414     {
    415       "title": "Scaling Instruction-Finetuned Language Models",
    416       "authors": ["H. W. Chung"],
    417       "year": 2022,
    418       "arxiv_id": "2210.11416",
    419       "relevance": "Scaling laws for instruction-tuned models, relevant to understanding how fine-tuning affects model capabilities."
    420     },
    421     {
    422       "title": "Constitutional AI: Harmlessness from AI Feedback",
    423       "authors": ["Y. Bai"],
    424       "year": 2022,
    425       "arxiv_id": "2212.08073",
    426       "relevance": "Core AI safety alignment technique for installing stable value patterns in LLMs."
    427     },
    428     {
    429       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    430       "authors": ["R. Rafailov"],
    431       "year": 2023,
    432       "relevance": "Key alignment technique (DPO) for behavioral shaping in LLMs."
    433     },
    434     {
    435       "title": "Steering Language Models with Activation Engineering",
    436       "authors": ["A. M. Turner"],
    437       "year": 2023,
    438       "arxiv_id": "2308.10248",
    439       "relevance": "Representation engineering for steering LLM behavior, directly relevant to controllability research."
    440     },
    441     {
    442       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    443       "authors": ["E. Hubinger"],
    444       "year": 2024,
    445       "arxiv_id": "2401.05566",
    446       "relevance": "Demonstrates persistent deceptive behaviors surviving safety training — key AI safety finding."
    447     },
    448     {
    449       "title": "Emergent Misalignment: Narrow Finetuning Can Produce Broadly Misaligned LLMs",
    450       "authors": ["J. Betley"],
    451       "year": 2025,
    452       "arxiv_id": "2502.17424",
    453       "relevance": "Shows narrow fine-tuning can cause broad persona shifts, relevant to alignment and safety evaluation."
    454     },
    455     {
    456       "title": "Fine-tuning Aligned Language Models Compromises Safety",
    457       "authors": ["X. Qi"],
    458       "year": 2023,
    459       "arxiv_id": "2310.03693",
    460       "relevance": "Demonstrates that fine-tuning can undo alignment, relevant to LLM safety evaluation methodology."
    461     },
    462     {
    463       "title": "Persona Vectors: Monitoring and Controlling Character Traits in Language Models",
    464       "authors": ["R. Chen"],
    465       "year": 2025,
    466       "arxiv_id": "2507.21509",
    467       "relevance": "Methodology used in this paper for constructing persona directions — directly relevant to representation engineering."
    468     },
    469     {
    470       "title": "Stress Testing Deliberative Alignment for Anti-Scheming Training",
    471       "authors": ["B. Schoen"],
    472       "year": 2025,
    473       "arxiv_id": "2509.15541",
    474       "relevance": "Evaluates alignment robustness with attention to situational awareness confounds — relevant to AI safety methodology."
    475     },
    476     {
    477       "title": "Emergent Abilities of Large Language Models",
    478       "authors": ["J. Wei"],
    479       "year": 2022,
    480       "relevance": "Foundational claim about emergent abilities in LLMs, relevant to capability evaluation methodology."
    481     },
    482     {
    483       "title": "Are Emergent Abilities of LLMs a Mirage?",
    484       "authors": ["R. Schaeffer", "B. Miranda", "S. Koyejo"],
    485       "year": 2023,
    486       "arxiv_id": "2304.15004",
    487       "relevance": "Challenges emergent abilities claims as metric artifacts — important methodological critique."
    488     }
    489   ]
    490 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs