scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23969B)
      1 {
      2   "paper": {
      3     "title": "Constitutional AI: Harmlessness from AI Feedback",
      4     "authors": ["Yuntao Bai", "Saurav Kadavath", "Sandipan Kundu", "Amanda Askell", "Jackson Kernion", "Andy Jones", "Anna Chen", "Anna Goldie", "Azalia Mirhoseini", "Cameron McKinnon", "Carol Chen", "Catherine Olsson", "Christopher Olah", "Danny Hernandez", "Dawn Drain", "Deep Ganguli", "Dustin Li", "Eli Tran-Johnson", "Ethan Perez", "Jamie Kerr", "Jared Mueller", "Jeffrey Ladish", "Joshua Landau", "Kamal Ndousse", "Kamile Lukosuite", "Liane Lovitt", "Michael Sellitto", "Nelson Elhage", "Nicholas Schiefer", "Noemi Mercado", "Nova DasSarma", "Robert Lasenby", "Robin Larson", "Sam Ringer", "Scott Johnston", "Shauna Kravec", "Sheer El Showk", "Stanislav Fort", "Tamera Lanham", "Timothy Telleen-Lawton", "Tom Conerly", "Tom Henighan", "Tristan Hume", "Samuel R. Bowman", "Zac Hatfield-Dodds", "Ben Mann", "Dario Amodei", "Nicholas Joseph", "Sam McCandlish", "Tom Brown", "Jared Kaplan"],
      5     "year": 2022,
      6     "venue": "arXiv",
      7     "arxiv_id": "2212.08073"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repository released: https://github.com/anthropics/ConstitutionalHarmlessnessPaper (Section 1.3, footnote 6), containing few-shot prompts, constitutional principles, and model responses."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The red-teaming data draws on prior work (Ganguli et al., 2022), which is available at https://github.com/anthropics/hh-rlhf, but the actual training data generated for this paper (182,831 constitutional comparisons, SL-CAI finetuning data) is not released. The repository contains prompts and samples but not the full training datasets."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications, requirements files, or dependency versions are mentioned. The paper describes model sizes but not the software environment needed to reproduce."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided. The method is described at a high level but there are no scripts or README with commands to replicate experiments."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Error bars are shown in Figure 3 for Elo scores across model sizes. Figure 2 notes 'Error bars are visible in Figure 3 but are suppressed here for clarity.'"
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No statistical significance tests are reported. Claims like 'RL-CAI models are significantly more harmless' (Section 4.3) are based on Elo score comparisons without formal tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Effect sizes can be gauged from the Elo score differences shown in Figures 2, 3, and 8, which include baselines for comparison. Absolute harmfulness scores (0-4 scale) in Figure 10 provide additional context on magnitude."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Sample sizes are stated (10,274 helpfulness and 8,135 harmlessness comparisons for AB testing, Section 3.3; 64 held-out red team prompts for absolute scores, Section 4.5) but no justification or power analysis is provided."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations or variance across runs are reported. Error bars appear in Figure 3 but it is not stated whether these represent std dev, confidence intervals, or something else. No mention of multiple experimental runs."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Multiple baselines: helpful RLHF, HH RLHF (trained with human harmlessness feedback), SL-CAI, and pretrained LMs are compared (Figures 2, 3, 8)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines are from the authors' own concurrent work (Bai et al., 2022) and represent the state of the art for RLHF-trained assistants at the time."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Several ablations: critiqued vs. direct revisions (Figure 7), varying number of constitutional principles (Figure 6), varying number of revisions (Figure 5), soft vs. hard vs. clamped preference labels (Section 4.3), with and without chain-of-thought (Figures 2, 3)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics: helpfulness Elo, harmlessness Elo (Figures 2, 3), absolute harmfulness scores on 0-4 scale (Figure 10), HHH evaluation accuracy (Figure 4), PM scores (Figure 5), calibration (Figure 9)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Crowdworker evaluations via model comparison tests produce the Elo scores (Section 3.3). Absolute harmfulness ratings from crowdworkers are also used (Section 4.5)."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 3.3 states comparison test conversations are 'similar in distribution to, but distinct from, those appearing in the PM and RL training data.' Section 4.5 uses '64 hand-picked held-out red team prompts.'"
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results broken down by model size (Figure 3), by number of revisions (Figure 5), by number of principles (Figure 6), by RL training step (Figure 8). Appendix B breaks down harm classification into 9 categories."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4.3 discusses Goodharting/over-training failure modes with concrete examples of overly harsh or boilerplate responses ('you are valid, valued, and cared for'). Section 3.5 notes critiques are 'sometimes reasonable, but often made inaccurate or overstated criticisms.'"
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Reports that helpfulness scores decrease with revisions (Figure 5 center), that SL-CAI is less helpful than RL models (Section 3.3), that CoT is slightly less helpful (Section 4.3), and that over-training causes Goodharting (Section 4.3)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims about training harmless but non-evasive assistants without human harmlessness labels are supported by Elo comparisons (Figures 2, 3) and qualitative examples (Appendix D). The claim that RL-CAI is 'preferred by crowdworkers' is supported by Section 3.3."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims are supported by controlled ablations: critique vs. no critique (Figure 7), varying revisions (Figure 5), varying principles (Figure 6), CoT vs. no CoT (Figures 2, 3). These are single-variable manipulations within the same experimental framework."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes broad claims about 'Constitutional AI' as a general method but tests only on their own proprietary models at specific sizes (up to 52B). No testing on other model families, languages, or domains. The title and framing suggest generality beyond what is tested."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 4.4 discusses how instruction changes to crowdworkers may explain differences from prior work. Section 4.5 caveats that 'absolute scores may not be well-calibrated, as different workers may have their own personal biases.' Section 3.4 notes PM scores 'become less calibrated at higher values.'"
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are described only by parameter count (e.g., '52B') and reference to 'pretrained in the way we described in prior work [Bai et al., 2022].' No specific model version names, snapshot dates, or release identifiers are provided."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full constitutional principles for both SL-CAI and RL-CAI are provided in Appendix C. Few-shot examples are referenced as being in Appendix E and the GitHub repository. The actual prompt formatting is shown in Sections 3.1 and 4.1."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 3.2: temperature T=1 for sampling, learning rate 0.5 relative to pretraining LR, batch size 1024. Section 4.3: probability clamping at 40-60%. Section 4.2: 'same hyperparameters as our prior work [Bai et al., 2022].'"
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The method is a training pipeline (SL finetuning + RL), not an agentic system."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3.2 documents data sources and counts: 42,496 human-written red team prompts + 140,335 model-generated = 182,831 total; 135,296 helpfulness prompts; 4 revisions per red team prompt; 2 responses per helpfulness prompt. Section 4.2 provides PM comparison data counts."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6 (Discussion) and Section 6.2 (Broader Impacts) discuss limitations including dual-use concerns, risk of deploying insufficiently tested models, and remaining reliance on helpfulness labels."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific threats discussed: PM score calibration issues at high values (Section 3.4), crowdworker instruction changes affecting Elo comparisons (Section 4.4), Goodharting from over-training (Section 4.3), constitutional principles 'selected in a fairly ad hoc manner' (footnote 7)."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to specific model families, languages, or deployment contexts. Section 6.1 frames remaining work as 'future directions' rather than explicit scope boundaries."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Raw crowdworker comparison data and training data are not released. Only the constitutional principles and sample outputs are available in the GitHub repository."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3.2 describes data collection: red team prompts from Ganguli et al. (2022) plus model-generated prompts. Section 3.3 describes crowdworker evaluation collection (10,274 helpfulness + 8,135 harmlessness comparisons). Section 4.4 describes instruction differences for crowdworkers."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Crowdworker platforms identified: Surge AI for current comparison tests, Upwork and MTurk for prior PM data collection (Section 4.4). Red teaming crowdworkers described in Ganguli et al. (2022)."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full pipeline is documented: red team prompts → model responses → critiques → revisions → SL finetuning (Sections 3.1-3.2); response pairs → AI feedback labels → PM training → RL (Sections 4.1-4.2). Data counts at each stage are provided."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding disclosure or acknowledgment of funding sources. All authors are from Anthropic but no explicit funding statement is provided."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors listed as affiliated with Anthropic, clearly stated at the top of the paper."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Anthropic employees are evaluating a method (Constitutional AI) developed at Anthropic for training Anthropic's own models. The funder/employer has a direct commercial interest in the success of the method."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement. No disclosure of equity, patents, or other financial interests related to Constitutional AI or Anthropic's products."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No training data cutoff date is stated for the pretrained models used. The paper references models 'pretrained in the way we described in prior work' without specifying when training data was collected."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether the HHH evaluation questions (from Askell et al., 2021 and newly written ones) could overlap with pretraining data."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The HHH evaluations from Askell et al. (2021) were published before model training, and the BIG Bench evaluations (Srivastava et al., 2022) were also public. No contamination analysis is provided."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No pre-registration mentioned. Crowdworker evaluations are used for model comparison but the study was not pre-registered."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No mention of IRB or ethics board approval for the crowdworker studies."
    243       },
    244       "demographics_reported": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No demographics reported for crowdworkers performing evaluations on any of the three platforms (Surge AI, MTurk, Upwork)."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No inclusion/exclusion criteria for crowdworkers described beyond platform membership."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "This is not an experimental study with human participants assigned to conditions. Crowdworkers evaluate model outputs but are not randomized into treatment/control groups."
    258       },
    259       "blinding_described": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "No mention of whether crowdworkers knew which model produced which response during comparison tests."
    263       },
    264       "attrition_reported": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No attrition or dropout information reported for crowdworkers."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference costs, API costs, or latency figures reported despite the method requiring multiple rounds of model sampling (4 critique-revision pairs per prompt, multiple RL training runs)."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No GPU hours, total compute, or training time reported for any of the experiments (SL finetuning, RL training, preference model training)."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "RL-CAI models trained with AI feedback are preferred by crowdworkers over models trained with human feedback labels for harmlessness.",
    286       "evidence": "Figures 2 and 3 show Elo scores from crowdworker comparisons where RL-CAI achieves higher harmlessness scores than HH RLHF. Section 3.3 reports 10,274 helpfulness and 8,135 harmlessness comparisons.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Constitutional AI achieves a Pareto improvement: higher harmlessness at a given level of helpfulness compared to standard RLHF.",
    291       "evidence": "Figure 2 shows the helpfulness-harmlessness Elo frontier, with RL-CAI points above and to the right of HH RLHF points.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Chain-of-thought reasoning improves AI identification of harmful behavior, approaching human feedback-trained preference model performance.",
    296       "evidence": "Figure 4 shows CoT and ensembled CoT accuracy on 438 HHH binary comparisons approaching PM accuracy at 52B parameters.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Model-generated critiques and revisions progressively reduce harmfulness.",
    301       "evidence": "Figure 5 shows monotonically increasing harmlessness PM scores across 0-4 revisions. Figure 7 compares critiqued vs direct revisions.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "RL-CAI is virtually never evasive, unlike HH RLHF which frequently gives canned refusal responses.",
    306       "evidence": "Qualitative examples in Appendix D show HH RLHF responding with 'I'm sorry. I won't respond' while RL-CAI gives substantive responses. Section 4.4 discusses this qualitatively.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval", "observational"],
    311   "key_findings": "Constitutional AI (CAI) trains harmless but non-evasive AI assistants using only a set of natural language principles (a 'constitution') instead of human feedback labels for harmlessness. The two-stage process (supervised critique-revision followed by RL from AI feedback) achieves higher harmlessness than RLHF models trained with human harmlessness labels while maintaining comparable helpfulness, as measured by crowdworker Elo scores. Chain-of-thought reasoning improves AI feedback quality, approaching the accuracy of preference models trained on human feedback. The method substantially reduces evasiveness compared to standard HH RLHF models.",
    312   "red_flags": [
    313     {
    314       "flag": "Company evaluating own method",
    315       "detail": "All authors are Anthropic employees evaluating a method (Constitutional AI) that directly benefits Anthropic's product development. No external evaluation or independent replication."
    316     },
    317     {
    318       "flag": "No compute costs reported",
    319       "detail": "The method requires training multiple large models (up to 52B parameters), generating hundreds of thousands of critique-revision pairs, and running RL training, but no compute costs or GPU hours are reported."
    320     },
    321     {
    322       "flag": "Crowdworker instruction change confounds comparison",
    323       "detail": "Section 4.4 acknowledges that crowdworker instructions were changed from prior work to penalize evasiveness. This makes direct comparison with prior RLHF results unreliable, as the evaluation criteria shifted."
    324     },
    325     {
    326       "flag": "No statistical tests on main claims",
    327       "detail": "Claims of 'significantly more harmless' are based on Elo score differences without any formal statistical testing."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback",
    333       "authors": ["Yuntao Bai", "Andy Jones", "Kamal Ndousse"],
    334       "year": 2022,
    335       "relevance": "Foundation work on RLHF for AI assistants; establishes helpfulness-harmlessness tradeoff that CAI addresses."
    336     },
    337     {
    338       "title": "Red teaming language models to reduce harms: Methods, scaling behaviors, and lessons learned",
    339       "authors": ["Deep Ganguli", "Liane Lovitt", "Jackson Kernion"],
    340       "year": 2022,
    341       "relevance": "Red teaming methodology and datasets used for CAI training and evaluation."
    342     },
    343     {
    344       "title": "Training language models to follow instructions with human feedback",
    345       "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"],
    346       "year": 2022,
    347       "arxiv_id": "2203.02155",
    348       "relevance": "InstructGPT paper; RLHF method that CAI extends by replacing human harmlessness labels with AI feedback."
    349     },
    350     {
    351       "title": "Improving alignment of dialogue agents via targeted human judgements",
    352       "authors": ["Amelia Glaese", "Nat McAleese", "Maja Trebacz"],
    353       "year": 2022,
    354       "relevance": "Sparrow model using rule-based decomposition of harmlessness, similar in spirit to CAI's constitutional principles."
    355     },
    356     {
    357       "title": "Deep reinforcement learning from human preferences",
    358       "authors": ["Paul Christiano", "Jan Leike", "Tom B. Brown"],
    359       "year": 2017,
    360       "relevance": "Foundational RLHF method that CAI builds upon and partially replaces."
    361     },
    362     {
    363       "title": "Language models (mostly) know what they know",
    364       "authors": ["Saurav Kadavath", "Tom Conerly", "Amanda Askell"],
    365       "year": 2022,
    366       "relevance": "Calibration of language model outputs used to justify soft preference labels in RLAIF."
    367     },
    368     {
    369       "title": "Chain of thought prompting elicits reasoning in large language models",
    370       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    371       "year": 2022,
    372       "relevance": "Chain-of-thought technique used in CAI to improve AI feedback quality and transparency."
    373     },
    374     {
    375       "title": "Scaling laws for reward model overoptimization",
    376       "authors": ["Leo Gao", "John Schulman", "Jacob Hilton"],
    377       "year": 2022,
    378       "relevance": "Reward model Goodharting analysis relevant to CAI's observed over-training failure modes."
    379     },
    380     {
    381       "title": "Measuring progress on scalable oversight for large language models",
    382       "authors": ["Samuel R. Bowman", "Jeeyoon Hyun", "Ethan Perez"],
    383       "year": 2022,
    384       "relevance": "Empirical work on scalable oversight directly motivating CAI's approach to AI supervision."
    385     },
    386     {
    387       "title": "A general language assistant as a laboratory for alignment",
    388       "authors": ["Amanda Askell", "Yuntao Bai", "Anna Chen"],
    389       "year": 2021,
    390       "relevance": "Defines helpful, harmless, honest (HHH) framework and evaluation data used in CAI."
    391     },
    392     {
    393       "title": "Beyond the imitation game: Quantifying and extrapolating the capabilities of language models",
    394       "authors": ["Aarohi Srivastava"],
    395       "year": 2022,
    396       "relevance": "BIG Bench includes HHH evaluations used to benchmark CAI feedback quality."
    397     }
    398   ]
    399 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs