scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24284B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Human-Instruction-Free LLM Self-Alignment with Limited Samples",
      6     "authors": [
      7       "Hongyi Guo",
      8       "Yuanshun Yao",
      9       "Wei Shen",
     10       "Jiaheng Wei",
     11       "Xiaoying Zhang",
     12       "Zhaoran Wang",
     13       "Yang Liu"
     14     ],
     15     "year": 2024,
     16     "venue": "arXiv.org",
     17     "arxiv_id": "2401.06785",
     18     "doi": "10.48550/arXiv.2401.06785"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Claims of iterative self-alignment without human instructions, tested on three benchmarks with good performance, are all backed by experimental results in Tables 2–6 and Figures 2–4.",
     26         "source": "haiku"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Causal claim that iterative training outperforms one-time training is tested in Table 4 with equal data budgets (2×512 vs 1×1024 samples), a valid ablation design for this specific claim.",
     32         "source": "haiku"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The conclusion claims 'superiority in terms of alignment performance, domain adaptability, and scalability' without bounding to the two model families and three specific benchmarks tested; OPT-350M Iter 1 actually worsens harmful rate vs. pretrained.",
     38         "source": "haiku"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No alternative explanations are discussed; improvements are attributed entirely to retrieval-augmented ICL and iteration without considering dataset selection effects, evaluation artifact bias, or the role of in-distribution seed data.",
     44         "source": "haiku"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Safety is measured by Beaver-Dam-7B classifier and utility by a reward model, both automated proxies for human values; the paper acknowledges 'proxy for human preference' for helpfulness but does not discuss the gap between classifier-measured harm and actual human judgment.",
     50         "source": "haiku"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "There is no dedicated limitations or threats-to-validity section; the paper moves directly from experiments to a conclusion that is entirely positive.",
     58         "source": "haiku"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No specific threats to validity are discussed anywhere in the paper, including the fact that seed training data is sampled from the same benchmarks used for evaluation.",
     64         "source": "haiku"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The paper does not state what results do NOT show; for instance it does not note that ROUGE-L is a weak proxy for truthfulness or that evaluations are limited to two model families on three narrow benchmarks.",
     70         "source": "haiku"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding source is disclosed; a footnote notes the work was done during internships at ByteDance Research but this is an affiliation note, not a funding disclosure.",
     78         "source": "haiku"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Author affiliations are clearly listed on the first page: Northwestern University, ByteDance Research, Fudan University, and UC Santa Cruz.",
     84         "source": "haiku"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The work evaluates general open-source alignment methods (LLaMA, OPT) rather than ByteDance products, so ByteDance does not have a direct financial stake in the specific experimental outcome.",
     90         "source": "haiku"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests statement or financial interest declaration appears anywhere in the paper.",
     96         "source": "haiku"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Alignment is defined in the introduction as making LLMs follow human instructions and generate safe outputs; self-alignment, ICL, and retrieval-augmented generation are explained with context and prior work.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The contribution is clearly stated: ISARA, an iterative self-alignment algorithm requiring only <100 seed examples and no human-crafted instructions or reward models.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper provides a substantive related work section and Table 1 systematically compares ISARA against nine prior self-alignment methods on four dimensions, clearly positioning the contribution.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "No code repository is released; they reference the external safe-rlhf library for fine-tuning but provide no ISARA implementation link.",
    127           "source": "haiku"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "All benchmarks used (BeaverTails, TruthfulQA, AlpacaEval) are publicly available; the preprocessed splits are not released but the source data is public.",
    133           "source": "haiku"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Hardware (A100 80GB) and model sources (HuggingFace) are mentioned but no requirements.txt, conda environment, or Dockerfile is provided.",
    139           "source": "haiku"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "Hyperparameters and prompts are provided in the appendix, but no step-by-step instructions for running the ISARA pipeline are included.",
    145           "source": "haiku"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "All results are point estimates only; no confidence intervals, standard deviations, or error bars are reported in any table or figure.",
    153           "source": "haiku"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No statistical significance tests are reported for any comparative claims despite multiple cross-method comparisons.",
    159           "source": "haiku"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Percentage harmful rates with baselines, ROUGE-L differences, and win rates against specific competitors are reported, providing interpretable effect magnitude.",
    165           "source": "haiku"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "The choice of 64 seed examples and 250 evaluation prompts per category is not justified with a power analysis or any reasoning about statistical adequacy.",
    171           "source": "haiku"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "No variance or standard deviation across multiple runs is reported; all results appear to be single-run point estimates.",
    177           "source": "haiku"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Baselines include the pretrained model, SFT, ICL-kNN, and ICL-Random across all three benchmarks.",
    185           "source": "haiku"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Baselines include ReST, Self-Align, and ICL variants that are contemporary with the 2024 submission; the exclusion of RLHF is explained by data requirement differences.",
    191           "source": "haiku"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Table 3 ablates model sizes (350M–6.7B), Table 4 compares iterative vs. single-round training with equal data, and ICL-kNN vs. ICL-Random tests the retrieval component.",
    197           "source": "haiku"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Safety is evaluated on both harmful rate (Beaver-Dam-7B) and utility (reward model score); instruction-following uses win rates against multiple baselines.",
    203           "source": "haiku"
    204         },
    205         "human_evaluation": {
    206           "applies": true,
    207           "answer": false,
    208           "justification": "AlpacaEval uses GPT-4 as an automatic judge; no actual human evaluation of model outputs is conducted.",
    209           "source": "haiku"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Each benchmark is split into 64 training examples and a held-out test set (250 prompts per BeaverTails category, remaining TruthfulQA questions, remaining AlpacaEval tasks).",
    215           "source": "haiku"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Safety results are broken down across three harm categories (discrimination/stereotype, hate speech/offensive language, non-violent unethical behavior) in Table 2.",
    221           "source": "haiku"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": false,
    226           "justification": "Appendix B shows qualitative examples of outputs but these are selected to showcase improvements; OPT-350M's Iter 1 regression (34.9% vs 29.5% harmful) is not analyzed or discussed as a failure.",
    227           "source": "haiku"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "OPT models fail on AlpacaEval and are excluded, and OPT-6.7B on TruthfulQA still shows a negative ROUGE-L difference (-5.88) even with ISARA, both of which are reported.",
    233           "source": "haiku"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Exact model identifiers are given: OPT-350M, OPT-1.3B, OPT-2.7B, OPT-6.7B, LLaMA-7B, LLaMA-2-7B, all downloaded from HuggingFace.",
    241           "source": "haiku"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "Appendix A.2 and A.3 show the actual prompt templates for question and answer generation with placeholders and their fill structure made explicit.",
    247           "source": "haiku"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "Learning rate (2×10^-5), cosine scheduler, batch size 4, beam search width 5, repetition_penalty, no_repeat_ngram_size, length_penalty, and zero-stage 2 are all reported in Appendix A.",
    253           "source": "haiku"
    254         },
    255         "scaffolding_described": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "Algorithm 1 provides full pseudocode for the ISARA pipeline including data generation, retrieval-augmented ICL, filtering, and iterative fine-tuning steps.",
    259           "source": "haiku"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "Appendix A.1 documents BeaverTails preprocessing (resolving contradictory annotations via majority label) and TruthfulQA train/test splitting procedure.",
    265           "source": "haiku"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "The self-generated training data produced by ISARA is not released; only the source benchmarks (public) are available.",
    273           "source": "haiku"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "The process for sampling seed examples and generating new examples via retrieval-augmented ICL is described in detail in Section 4.1 and Algorithm 1.",
    279           "source": "haiku"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human participants were recruited; standard public benchmarks were used.",
    285           "source": "haiku"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "The full pipeline from seed data → retrieval-augmented generation → filtering → SFT fine-tuning is documented in Algorithm 1 and Sections 4.1–4.3.",
    291           "source": "haiku"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "The training data cutoffs for LLaMA and OPT are never mentioned, despite TruthfulQA (2021) predating LLaMA's training cutoff and potentially having been seen during pretraining.",
    299           "source": "haiku"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "No discussion of whether the pretrained models (LLaMA, OPT) saw TruthfulQA or BeaverTails-related content during pretraining, which would inflate apparent alignment gains.",
    305           "source": "haiku"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "TruthfulQA was published in 2021 before LLaMA's training cutoff; no acknowledgment or analysis of potential benchmark contamination is provided.",
    311           "source": "haiku"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants; not applicable.",
    319           "source": "haiku"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants; not applicable.",
    325           "source": "haiku"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants; not applicable.",
    331           "source": "haiku"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants; not applicable.",
    337           "source": "haiku"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants; not applicable.",
    343           "source": "haiku"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants; not applicable.",
    349           "source": "haiku"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants; not applicable.",
    355           "source": "haiku"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "No inference cost or latency figures are reported; only the training hardware (A100 80G) is mentioned.",
    363           "source": "haiku"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "Hardware is mentioned ('one NVIDIA A100 80G GPU') but total GPU-hours, training time, or compute budget is not reported.",
    369           "source": "haiku"
    370         }
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "ISARA outperforms SFT in safety alignment on both LLaMA-7B and OPT-6.7B across all three harm categories",
    377       "evidence": "Table 2 shows ISARA harmful rates of 1.2%, 6.0%, 9.6% vs SFT rates of 9.2%, 12.0%, 12.8% for LLaMA-7B across three categories",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Iterative training with ISARA outperforms single-round training given equal total data",
    382       "evidence": "Table 4 shows ISARA N=512 Iter 2 achieves 5.6% vs N=1024 Iter 1 at 12.8% harmful rate for LLaMA-7B with identical data volumes",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "ISARA works on models as small as 350M parameters without human-crafted instructions",
    387       "evidence": "Table 3 shows OPT-350M improves from 29.5% to 22.1% harmful rate after two ISARA iterations, though Iter 1 actually worsens to 34.9%",
    388       "supported": "weak"
    389     },
    390     {
    391       "claim": "ISARA generalizes across harm categories not seen in training",
    392       "evidence": "Figure 2 shows off-diagonal results where training on one category improves performance in others, though improvements are smaller than in-domain",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "ISARA improves harmlessness without sacrificing utility/helpfulness",
    397       "evidence": "Figure 3 shows ISARA wins 88% vs pretrained and 50% vs SFT on the utility reward model for LLaMA-7B, indicating maintained helpfulness",
    398       "supported": "weak"
    399     },
    400     {
    401       "claim": "ISARA achieves data scaling ratios exceeding 6x from seed to generated samples",
    402       "evidence": "Table 5 reports per-domain scaling ratios of 5.8x–7.2x for LLaMA-7B and OPT-6.7B on BeaverTails safety task",
    403       "supported": "strong"
    404     }
    405   ],
    406   "methodology_tags": [
    407     "benchmark-eval",
    408     "empirical"
    409   ],
    410   "key_findings": "ISARA demonstrates that retrieval-augmented ICL can bootstrap LLM safety, truthfulness, and instruction-following alignment using fewer than 100 seed examples and no human-crafted instructions, consistently outperforming SFT baselines across LLaMA-7B, OPT-6.7B, and LLaMA-2-7B. Iterative training produces consistent incremental gains over single-round fine-tuning with equivalent data. The approach scales down to OPT-350M, though with weaker improvements. All evaluations rely on automated metrics (classifier-based harm detection, ROUGE-L, GPT-4 judge) rather than human evaluation.",
    411   "red_flags": [
    412     {
    413       "flag": "Benchmark-internal train/test split without contamination discussion",
    414       "detail": "Seed training data is sampled from the same benchmarks used for evaluation (64 from TruthfulQA, BeaverTails); no discussion of whether pretrained LLaMA/OPT already saw this data during pretraining, particularly TruthfulQA (2021)."
    415     },
    416     {
    417       "flag": "OPT-350M Iter 1 regression not analyzed",
    418       "detail": "In Table 3, OPT-350M's harmful rate increases from 29.5% (pretrained) to 34.9% after the first ISARA iteration before recovering to 22.1% in Iter 2. This potential instability is not discussed."
    419     },
    420     {
    421       "flag": "No variance or significance testing",
    422       "detail": "All results are single-run point estimates with no confidence intervals, standard deviations, or statistical significance tests across any comparative claim."
    423     },
    424     {
    425       "flag": "No limitations section",
    426       "detail": "The paper has no limitations or threats-to-validity section; the conclusion is entirely positive without acknowledging constraints on generalizability or evaluation reliability."
    427     },
    428     {
    429       "flag": "Automated evaluation proxies treated as ground truth",
    430       "detail": "Beaver-Dam-7B classifier and a reward model are used as proxies for safety and helpfulness; the gap between these automated metrics and actual human value alignment is not discussed."
    431     },
    432     {
    433       "flag": "No code or generated data released",
    434       "detail": "ISARA implementation and self-generated training datasets are not released, preventing independent reproduction of the iterative alignment pipeline."
    435     }
    436   ],
    437   "cited_papers": [
    438     {
    439       "title": "Training language models to follow instructions with human feedback (InstructGPT)",
    440       "relevance": "Foundational RLHF alignment technique that ISARA aims to reduce dependence on"
    441     },
    442     {
    443       "title": "Principle-driven self-alignment of language models from scratch with minimal human supervision (Self-Align)",
    444       "relevance": "Direct predecessor self-alignment method that still requires human-crafted principles; primary comparison target"
    445     },
    446     {
    447       "title": "Self-Instruct: Aligning language model with self generated instructions",
    448       "relevance": "Key prior work on instruction data bootstrapping; requires GPT-3 175B and human seed instructions"
    449     },
    450     {
    451       "title": "Reinforced self-training (ReST) for language modeling",
    452       "relevance": "Closest iterative self-alignment competitor using learned reward models instead of human instructions"
    453     },
    454     {
    455       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    456       "relevance": "Core technique (RAG) used as the basis for ISARA's answer generation component"
    457     },
    458     {
    459       "title": "LIMA: Less is more for alignment",
    460       "relevance": "Related finding that 1,000 examples suffice for alignment; motivates ISARA's limited-sample setting"
    461     },
    462     {
    463       "title": "TruthfulQA: Measuring how models mimic human falsehoods",
    464       "relevance": "One of three benchmarks used for evaluation"
    465     },
    466     {
    467       "title": "BeaverTails: Towards improved safety alignment of LLM via a human-preference dataset",
    468       "relevance": "Primary benchmark and evaluation classifier for safety alignment experiments"
    469     }
    470   ],
    471   "engagement_factors": {
    472     "practical_relevance": {
    473       "score": 2,
    474       "justification": "Reducing alignment data requirements to <100 examples without human instruction writing addresses a real bottleneck for practitioners working on domain-specific alignment."
    475     },
    476     "surprise_contrarian": {
    477       "score": 2,
    478       "justification": "Challenges the prevailing assumption that self-alignment requires human-crafted principles or large models (65B+), showing it works down to 350M."
    479     },
    480     "fear_safety": {
    481       "score": 1,
    482       "justification": "Touches on AI safety alignment but is framed as a positive capability paper rather than one raising risks; low fear factor."
    483     },
    484     "drama_conflict": {
    485       "score": 1,
    486       "justification": "Positions against established methods from large labs but the tone is purely technical with no controversy angle."
    487     },
    488     "demo_ability": {
    489       "score": 2,
    490       "justification": "Uses open-source models (LLaMA, OPT) available on HuggingFace; method is implementable in principle, though no code is released."
    491     },
    492     "brand_recognition": {
    493       "score": 1,
    494       "justification": "ByteDance Research affiliation and Northwestern University; not a top-tier alignment lab, reducing visibility despite relevant topic."
    495     }
    496   },
    497   "hn_data": {
    498     "threads": [],
    499     "top_points": 0,
    500     "total_points": 0,
    501     "total_comments": 0
    502   }
    503 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs