scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (33845B)
      1 {
      2   "paper": {
      3     "title": "I-SHEEP: Self-Alignment of LLM from Scratch through an Iterative Self-Enhancement Paradigm",
      4     "authors": [
      5       "Yiming Liang",
      6       "Ge Zhang",
      7       "Xingwei Qu",
      8       "Tianyu Zheng",
      9       "Jiawei Guo",
     10       "Xinrun Du",
     11       "Zhenzhu Yang",
     12       "Jiaheng Liu",
     13       "Chenghua Lin",
     14       "Lei Ma",
     15       "Wenhao Huang",
     16       "Jiajun Zhang"
     17     ],
     18     "year": 2024,
     19     "venue": "arXiv",
     20     "arxiv_id": "2408.08072",
     21     "doi": "10.48550/arXiv.2408.08072"
     22   },
     23   "scan_version": 3,
     24   "active_modules": ["experimental_rigor", "data_leakage"],
     25   "methodology_tags": ["benchmark-eval"],
     26   "key_findings": "I-SHEEP enables iterative self-improvement of LLMs using self-generated synthetic data with metacognitive self-assessment filtering, achieving up to 78.2% relative improvement on AlpacaEval and 8.88% absolute gain on IFEval for Qwen-1.5 72B over multiple iterations. Improvement potential scales with model size: 1.8B-14B models plateau after 2 iterations while 72B sustains improvement through 5 iterations. Training the base model with data from the latest iteration outperforms cumulative data or iterative fine-tuning, and higher levels of self-assessment (evaluating both quality and instruction-following) yield the largest gains.",
     27   "checklist": {
     28     "artifacts": {
     29       "code_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The abstract states 'Our code, datasets, and models are available at https://anonymous.4open.science/r/SHEEP/' — an anonymous repository link is provided, though it is an anonymous review artifact rather than a permanent hosting location."
     33       },
     34       "data_released": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "The abstract mentions datasets are available at the same anonymous link. The seed data is based on the publicly available Alpaca seed dataset (175 prompts), and generated datasets are claimed to be released."
     38       },
     39       "environment_specified": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "Appendix E lists LoRA hyperparameters and mentions LLaMA-Factory and VLLM, and Section 4.2 mentions 'NVIDIA A800-SXM4-80GB×8', but no requirements.txt, Dockerfile, or dependency version list is provided. The specific library versions needed to recreate the environment are not stated."
     43       },
     44       "reproduction_instructions": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Algorithm 1 provides a high-level algorithmic description and Appendix E lists training hyperparameters, but there are no step-by-step reproduction instructions (e.g., commands to run, scripts to execute) that would allow a researcher to replicate the experiments without significant guesswork."
     48       }
     49     },
     50     "statistical_methodology": {
     51       "confidence_intervals_or_error_bars": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "All results in Tables 1-5 and Figures 2a-2b are reported as point estimates with no confidence intervals, error bars, or ± notation."
     55       },
     56       "significance_tests": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The paper claims improvements across iterations (e.g., '78.2% relative improvement') based solely on comparing raw numbers with no statistical significance tests (no p-values, t-tests, or any hypothesis testing)."
     60       },
     61       "effect_sizes_reported": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper provides relative improvements with baseline context, e.g., '78.2% in Alpaca Eval' (relative), '8.88% absolute increase in IFEval,' and Table 1 shows both baseline and improved values, allowing readers to assess magnitude."
     65       },
     66       "sample_size_justified": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Data size (10k) is chosen based on ablation results (Section 5.4) for pragmatic reasons ('resource savings'), but there is no formal justification or power analysis for why this sample size is sufficient for the claims made."
     70       },
     71       "variance_reported": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "All results appear to be from single experimental runs. No standard deviations, variance across seeds, or any spread measures are reported anywhere in the paper."
     75       }
     76     },
     77     "evaluation_design": {
     78       "baselines_included": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper compares against the base model, Self-Instruct (Wang et al., 2022b), and Dromedary (Sun et al., 2023b) as baselines (Section 4.4, Table 1)."
     82       },
     83       "baselines_contemporary": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Self-Instruct (2022) and Dromedary (2023) are the most relevant prior methods for self-alignment in low-resource settings, published within 1-2 years of this work."
     87       },
     88       "ablation_study": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Extensive ablations are conducted: threshold values (Figure 2a, Section 5.3), data size (Figure 2b, Section 5.4), iterative settings (Table 2, Section 5.2), filtering methods (Table 3), and self-assessment levels (Table 4, Section 5.5.2)."
     92       },
     93       "multiple_metrics": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "The paper evaluates on AlpacaEval, MT-Bench, IFEval (4 accuracy variants), HumanEval/Plus, MBPP, TriviaQA, SQuAD 2.0, plus standard NLU benchmarks (BoolQ, PIQA, SIQA, HellaSwag, etc.)."
     97       },
     98       "human_evaluation": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "No human evaluation is included. AlpacaEval and MT-Bench use GPT-as-a-judge, and all other evaluations are automated. The paper acknowledges relying on automated benchmarks."
    102       },
    103       "held_out_test_set": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Evaluation is on established benchmarks (AlpacaEval, MT-Bench, IFEval, HumanEval, MBPP, etc.) with predefined test splits. The training data is self-generated synthetic data, so there is clear separation between training and evaluation data."
    107       },
    108       "per_category_breakdown": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Table 12 (Appendix D) provides per-category MT-Bench breakdowns (coding, extraction, humanities, math, reasoning, role play, STEM, writing). Results are also broken down by model size (1.8B through 72B) in Table 1."
    112       },
    113       "failure_cases_discussed": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper discusses that second-round dialogue scores drop significantly after the 4th iteration (Section 5.1, Table 12), noting 'This decline is likely due to our generated data consisting solely of single-round dialogues.' Case studies of poor-quality outputs are shown in Appendix B."
    117       },
    118       "negative_results_reported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Several negative results are reported: smaller models (1.8B-14B) plateau after 2 iterations, PPL filtering underperforms self-assessment-based filtering (Table 3), and performance degrades on 2nd-turn dialogue after iteration 4 (Table 12). The paper stops iterations when 'performance improvement stagnates or diminishes.'"
    122       }
    123     },
    124     "claims_and_evidence": {
    125       "abstract_claims_supported": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The abstract claims '78.2% in Alpaca Eval, 24.0% in MT Bench, 8.88% in IFEval' are directly supported by Table 1 (72B results). Code generation (24.77%), TriviaQA (12.04%), and SQuAD (20.29%) improvements are also supported in Table 1."
    129       },
    130       "causal_claims_justified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Causal claims like 'I-SHEEP enables iterative self-improvement' are supported by controlled ablations: comparing with/without self-assessment (Table 4), different filtering methods (Table 3), and different iterative settings (Table 2). The ablation design isolates individual components through single-variable manipulation."
    134       },
    135       "generalization_bounded": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The title 'Self-Alignment of LLM from Scratch' and abstract claim broad applicability to LLMs generally, but experiments are limited to Qwen-1.5 (6 sizes) and Llama-3 (70B only). No testing on other model families (Mistral, Gemma, Phi, etc.). The Llama-3 evaluation is limited to IFEval only (Table 5), far less comprehensive than Qwen experiments."
    139       },
    140       "alternative_explanations_discussed": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper does not discuss alternative explanations for the observed improvements. For example, it does not consider whether the gains come simply from additional fine-tuning steps (more gradient updates) rather than the self-assessment mechanism, or whether the iterative data generation is just a form of data augmentation. No robustness checks against these confounds."
    144       },
    145       "proxy_outcome_distinction": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "The paper measures benchmark scores (AlpacaEval, MT-Bench, IFEval) and frames them as evidence of 'self-alignment' and 'self-improvement,' but does not discuss the gap between benchmark performance and actual alignment quality. The concept of 'alignment' encompasses much more than instruction-following accuracy, but this distinction is not acknowledged."
    149       }
    150     },
    151     "setup_transparency": {
    152       "model_versions_specified": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Specific model versions are stated: 'Qwen-1.5' with sizes 1.8B/4B/7B/14B/32B/72B (Section 4.2) and 'Llama-3' 70B (Section 5.6). These are identifiable model releases with specific parameter counts."
    156       },
    157       "prompts_provided": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Appendix A provides the full text of all three self-assessment prompt variants (Simple Standard, Combined Standard, and ICL) including exact wording and format specifications. The ICL prompt includes complete examples with scores."
    161       },
    162       "hyperparameters_reported": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Appendix E provides detailed LoRA hyperparameters: learning rate (5e-5), batch size (1), gradient accumulation steps (2), LoRA rank (8), alpha (16), dropout (0.05), cutoff length (1024), epochs (2.0), scheduler (cosine), warmup steps (20), and bf16 training."
    166       },
    167       "scaffolding_described": {
    168         "applies": false,
    169         "answer": false,
    170         "justification": "No agentic scaffolding is used. I-SHEEP is a training pipeline (data generation → self-assessment → filtering → SFT) with no agent, tool use, or iterative reasoning at inference time."
    171       },
    172       "data_preprocessing_documented": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 3.2 describes the two-stage filtering process: heuristic rule-based filtering following Self-Instruct, then self-assessment score-based filtering with threshold C. Section 4.3 details PPL filtering (threshold 50), density filtering (K=3000 clusters), and the combination approach."
    176       }
    177     },
    178     "limitations_and_scope": {
    179       "limitations_section_present": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 7 is explicitly titled 'Limitations' and discusses two specific issues: uncertainty about performance after the RLHF phase and ethical concerns about synthetic data bias."
    183       },
    184       "threats_to_validity_specific": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The limitations section is only two sentences long. The RLHF point ('the extent of final improvement after the RLHF phase remains uncertain') is somewhat specific, but the ethical concern ('synthetic data may intensify biases and harmful content') is generic. No discussion of threats specific to the experimental validity of the reported results."
    188       },
    189       "scope_boundaries_stated": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The paper does not explicitly state what the results do NOT show. There is no discussion of populations, settings, or model types excluded from the claims. The limitations section does not specify which conclusions should not be drawn from the work."
    193       }
    194     },
    195     "data_integrity": {
    196       "raw_data_available": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The abstract states 'Our code, datasets, and models are available at https://anonymous.4open.science/r/SHEEP/' — the generated datasets used for training are claimed to be released, which would allow verification of the data pipeline."
    200       },
    201       "data_collection_described": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Section 3.1 describes the data synthesis process in detail: starting from 175 Alpaca seed prompts, using ICL to generate new instructions, then zero-shot response generation. The process is formalized mathematically with an equation for instruction generation."
    205       },
    206       "recruitment_methods_described": {
    207         "applies": false,
    208         "answer": false,
    209         "justification": "No human participants are involved. All data is synthetically generated by the LLMs themselves from a seed dataset. No recruitment applies."
    210       },
    211       "data_pipeline_documented": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Algorithm 1 documents the full pipeline from seed data through prompt generation, response generation, self-assessment, filtering, and training. Figure 3 shows the proportion of high-quality data retained at each iteration. Filtering thresholds and criteria are specified."
    215       }
    216     },
    217     "conflicts_of_interest": {
    218       "funding_disclosed": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No funding sources or acknowledgments section is present in the paper. Several authors are affiliated with 01.ai and M-A-P, but no funding disclosure is provided."
    222       },
    223       "affiliations_disclosed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Author affiliations are clearly listed: University of Chinese Academy of Sciences, Institute of Automation CAS, University of Waterloo, University of Manchester, M-A-P, 01.ai, Peking University, and BAAI."
    227       },
    228       "funder_independent_of_outcome": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No funding source is disclosed, making it impossible to assess funder independence. Authors from 01.ai (an AI company) have potential commercial interest in alignment methods, but this is not discussed."
    232       },
    233       "financial_interests_declared": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No competing interests or financial interests statement is present in the paper. Multiple authors are affiliated with commercial entities (01.ai) that could benefit from the findings."
    237       }
    238     },
    239     "contamination": {
    240       "training_cutoff_stated": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "The paper uses Qwen-1.5 and Llama-3 base models but does not state their training data cutoff dates. This is critical because the paper evaluates on benchmarks (HumanEval, MBPP, TriviaQA, SQuAD) that predate these models."
    244       },
    245       "train_test_overlap_discussed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "No discussion of whether benchmark examples (HumanEval, MBPP, TriviaQA, SQuAD, etc.) appeared in the base models' pretraining data. The base model performance improvements could partly reflect memorized benchmark answers being surfaced by fine-tuning."
    249       },
    250       "benchmark_contamination_addressed": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "HumanEval (2021), MBPP (2021), TriviaQA (2017), and SQuAD (2018) were all published well before Qwen-1.5 and Llama-3 training. The paper does not address the contamination risk for any of these benchmarks."
    254       }
    255     },
    256     "human_studies": {
    257       "pre_registered": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants are involved in this study. All experiments are automated benchmark evaluations and synthetic data generation."
    261       },
    262       "irb_or_ethics_approval": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants are involved in this study."
    266       },
    267       "demographics_reported": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants are involved in this study."
    271       },
    272       "inclusion_exclusion_criteria": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants are involved in this study."
    276       },
    277       "randomization_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants are involved in this study."
    281       },
    282       "blinding_described": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants are involved in this study."
    286       },
    287       "attrition_reported": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "No human participants are involved in this study."
    291       }
    292     },
    293     "cost_and_practicality": {
    294       "inference_cost_reported": {
    295         "applies": true,
    296         "answer": true,
    297         "justification": "Section 4.2 states 'the maximum duration of each iteration is about 4 hours on NVIDIA A800-SXM4-80GB×8, equivalent to one iteration time for Qwen-1.5 72B,' providing wall-clock time and hardware for the largest model configuration."
    298       },
    299       "compute_budget_stated": {
    300         "applies": true,
    301         "answer": true,
    302         "justification": "Section 4.2 reports 4 hours per iteration on 8×A800 GPUs for the 72B model. With 5 productive iterations for 72B, the total compute for the main experiment can be estimated at ~20 hours on this hardware."
    303       }
    304     },
    305     "experimental_rigor": {
    306       "seed_sensitivity_reported": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No results across multiple random seeds are reported. All experiments appear to be single-run, with no analysis of seed sensitivity for either training or inference."
    310       },
    311       "number_of_runs_stated": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The number of experimental runs is never explicitly stated. Results are presented without indicating whether they come from a single run or are averaged over multiple runs."
    315       },
    316       "hyperparameter_search_budget": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "While ablations over threshold and data size are provided (Sections 5.3-5.4), the LoRA training hyperparameters appear fixed without reporting a search budget. No mention of how many configurations were tried for learning rate, LoRA rank, etc."
    320       },
    321       "best_config_selection_justified": {
    322         "applies": true,
    323         "answer": true,
    324         "justification": "Section 5.3 justifies threshold=8 selection based on ablation trends, and Section 5.4 justifies 10k data size based on resource efficiency and performance at iteration 3. The reasoning for configuration choices is transparent."
    325       },
    326       "multiple_comparison_correction": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The paper makes numerous comparisons across model sizes, iterations, filtering methods, and benchmarks without any statistical tests or multiple comparison corrections."
    330       },
    331       "self_comparison_bias_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The authors implement their own versions of Self-Instruct and Dromedary baselines. There is no acknowledgment of potential author-implementation bias, and no independent evaluation is conducted."
    335       },
    336       "compute_budget_vs_performance": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "Each iteration adds ~4 hours of compute (for 72B), but performance is not plotted as a function of compute. The 'Direct 20k/30k' comparison (Table 2) partially controls for data size but not compute. Whether simply training longer on more data would achieve similar gains is not analyzed."
    340       },
    341       "benchmark_construct_validity": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "The paper uses AlpacaEval, MT-Bench, and IFEval as proxies for 'alignment' and 'self-improvement' without discussing whether these benchmarks actually measure the broad concept of alignment claimed in the paper."
    345       },
    346       "scaffold_confound_addressed": {
    347         "applies": false,
    348         "answer": false,
    349         "justification": "No scaffolding is involved. I-SHEEP is a training pipeline comparing different data generation and filtering approaches, not scaffolded inference systems."
    350       }
    351     },
    352     "data_leakage": {
    353       "temporal_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of temporal leakage. Benchmarks like HumanEval (2021), MBPP (2021), TriviaQA (2017), and SQuAD (2018) were all available before Qwen-1.5 and Llama-3 training, meaning solutions could be in the pretraining data."
    357       },
    358       "feature_leakage_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of feature leakage. The self-generated training data could inadvertently encode patterns from benchmark tasks if the base model has memorized them, creating a feedback loop that inflates evaluation scores."
    362       },
    363       "non_independence_addressed": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No discussion of whether the self-generated training data and the evaluation benchmarks share structural similarities or overlap, which is a concern when the same model generates training data and is evaluated on benchmarks it may have memorized."
    367       },
    368       "leakage_detection_method": {
    369         "applies": true,
    370         "answer": false,
    371         "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipeline is mentioned."
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "I-SHEEP achieves a maximum relative improvement of 78.2% in AlpacaEval, 24.0% in MT-Bench, and 8.88% absolute increase in IFEval for Qwen-1.5 72B over iterative self-enhancement.",
    378       "evidence": "Table 1 shows Qwen-1.5 72B improving from iter1 to iter5: AlpacaEval 6.64→11.83, MT-Bench 6.43→7.97 (peak at iter3), IFEval prompt-level strict-accuracy 35.67→44.55.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "The potential for iterative improvement varies with model size — larger models can sustain more productive iterations.",
    383       "evidence": "Table 1 shows 1.8B-14B models improve over 2 iterations then plateau, 32B improves over 3 iterations, and 72B sustains improvement through 5 iterations before stagnating at iteration 6.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Training the base model with data from the last iteration (One_base) is the most effective iterative setting.",
    388       "evidence": "Table 2 shows One_base at iter3 (AlpacaEval 10.51, IFEval P-level 41.22) outperforms One_last (8.45, 38.63), Total_base (7.51, 37.52), and Direct 20k/30k settings across chat benchmarks.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Higher levels of metacognitive self-assessment lead to greater improvement in the I-SHEEP framework.",
    393       "evidence": "Table 4 shows that assessing both quality and instruction-following ('both') achieves best IFEval performance at iter3 (P-level 41.14, I-level 54.08) compared to quality-only (37.71, 51.44), following-only (39.93, 51.68), or no assessment (37.52, 48.92).",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "I-SHEEP generalizes to Llama-3 70B, showing stable iterative improvement.",
    398       "evidence": "Table 5 shows Llama-3 70B improving from iter1 to iter3 on IFEval (P-level 9.43→12.38, I-level 19.06→20.98). However, the evaluation is limited to IFEval only and absolute scores are much lower than Qwen.",
    399       "supported": "weak"
    400     },
    401     {
    402       "claim": "I-SHEEP surpasses the base model on standard benchmarks: 24.77% improvement in code generation, 12.04% in TriviaQA, and 20.29% in SQuAD.",
    403       "evidence": "Table 1 (72B) shows code generation HumanEval improving from 21.34→56.71 (+35.37), MBPP from 50.20→56.00, TriviaQA from 58.07→70.43, and SQuAD from 47.66→67.95. The claimed averages are supported.",
    404       "supported": "strong"
    405     }
    406   ],
    407   "red_flags": [
    408     {
    409       "flag": "No error bars or variance reporting",
    410       "detail": "All results across all tables and figures appear to be from single experimental runs. No standard deviations, confidence intervals, or variance across seeds are reported for any experiment, despite making numerous comparative claims."
    411     },
    412     {
    413       "flag": "No contamination analysis",
    414       "detail": "The paper evaluates on HumanEval, MBPP, TriviaQA, SQuAD, and other benchmarks published years before Qwen-1.5 and Llama-3 training, but never discusses whether base models may have memorized benchmark answers. Fine-tuning could surface memorized solutions, inflating apparent improvement."
    415     },
    416     {
    417       "flag": "Circular self-assessment validation",
    418       "detail": "The model assesses the quality of its own generated data and uses this to filter training data. The validity of this self-assessment is never independently verified — the paper shows self-assessment filtering helps downstream benchmarks but not whether the self-assigned scores correlate with actual data quality as judged by humans or external models."
    419     },
    420     {
    421       "flag": "Llama-3 generalization claim is thin",
    422       "detail": "The claim that I-SHEEP 'generalizes' to Llama-3 is based on only IFEval results for a single model size (70B) with very low absolute scores (9-12% prompt-level accuracy), far below Qwen-1.5 72B (35-44%). No other benchmarks are reported for Llama-3."
    423     },
    424     {
    425       "flag": "Anonymous repository link",
    426       "detail": "The code/data link (https://anonymous.4open.science/r/SHEEP/) is an anonymous review artifact, not a permanent repository. Reproducibility depends on this link being maintained and transitioned to a permanent host."
    427     },
    428     {
    429       "flag": "No comparison with compute-matched baselines",
    430       "detail": "Each iteration adds ~4 hours of compute on 8×A800 GPUs. The paper does not compare against simply training longer on more diverse data with the same compute budget, leaving open whether iterative self-enhancement is more effective than alternative uses of the same compute."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "Self-Instruct: Aligning Language Models with Self-Generated Instructions",
    436       "authors": ["Yizhong Wang", "Yeganeh Kordi", "Swaroop Mishra", "Alisa Liu", "Noah A. Smith", "Daniel Khashabi", "Hannaneh Hajishirzi"],
    437       "year": 2022,
    438       "arxiv_id": "2212.10560",
    439       "relevance": "Core baseline method for LLM self-alignment via self-generated instruction data, directly compared against in experiments."
    440     },
    441     {
    442       "title": "Principle-Driven Self-Alignment of Language Models from Scratch with Minimal Human Supervision",
    443       "authors": ["Zhiqing Sun", "Yikang Shen", "Qinhong Zhou", "Hongxin Zhang", "Zhenfang Chen", "David Cox", "Yiming Yang", "Chuang Gan"],
    444       "year": 2023,
    445       "relevance": "Dromedary method — primary baseline for principle-guided self-alignment of LLMs in low-resource settings."
    446     },
    447     {
    448       "title": "Self-Rewarding Language Models",
    449       "authors": ["Weizhe Yuan", "Richard Yuanzhe Pang", "Kyunghyun Cho", "Sainbayar Sukhbaatar", "Jing Xu", "Jason Weston"],
    450       "year": 2024,
    451       "arxiv_id": "2401.10020",
    452       "relevance": "Related iterative self-improvement method using self-reward for RLHF-phase alignment, relevant to understanding iterative LLM enhancement approaches."
    453     },
    454     {
    455       "title": "Meta-Rewarding Language Models: Self-Improving Alignment with LLM-as-a-Meta-Judge",
    456       "authors": ["Tianhao Wu", "Weizhe Yuan", "Olga Golovneva", "Jing Xu", "Yuandong Tian", "Jiantao Jiao", "Jason Weston", "Sainbayar Sukhbaatar"],
    457       "year": 2024,
    458       "arxiv_id": "2407.19594",
    459       "relevance": "Explores LLM self-improvement through meta-judging, directly relevant to self-assessment mechanisms for alignment."
    460     },
    461     {
    462       "title": "Self-Alignment with Instruction Backtranslation",
    463       "authors": ["Xian Li", "Ping Yu", "Chunting Zhou", "Timo Schick", "Luke Zettlemoyer", "Omer Levy", "Jason Weston", "Mike Lewis"],
    464       "year": 2023,
    465       "arxiv_id": "2308.06259",
    466       "relevance": "Alternative self-alignment approach using instruction backtranslation from raw text, relevant to understanding LLM active alignment capabilities."
    467     },
    468     {
    469       "title": "IterAlign: Iterative Constitutional Alignment of Large Language Models",
    470       "authors": ["Xiusi Chen", "Hongzhi Wen", "Sreyashi Nag", "Chen Luo", "Qingyu Yin", "Ruirui Li", "Zheng Li", "Wei Wang"],
    471       "year": 2024,
    472       "relevance": "Iterative alignment method using strong models (GPT-4, Claude2) for error correction, relevant comparison for iterative LLM enhancement approaches."
    473     },
    474     {
    475       "title": "Is DPO Superior to PPO for LLM Alignment? A Comprehensive Study",
    476       "authors": ["Shusheng Xu", "Wei Fu", "Jiaxuan Gao", "Wenjie Ye", "Weilin Liu", "Zhiyu Mei", "Guangju Wang", "Chao Yu", "Yi Wu"],
    477       "year": 2024,
    478       "arxiv_id": "2404.10719",
    479       "relevance": "Comprehensive comparison of alignment methods (DPO vs PPO) relevant to understanding the RLHF phase that I-SHEEP's limitations section references."
    480     },
    481     {
    482       "title": "Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing",
    483       "authors": ["Zhangchen Xu", "Fengqing Jiang", "Luyao Niu", "Yuntian Deng", "Radha Poovendran", "Yejin Choi", "Bill Yuchen Lin"],
    484       "year": 2024,
    485       "arxiv_id": "2406.08464",
    486       "relevance": "Related approach for alignment data synthesis without human annotation, directly relevant to the self-alignment paradigm explored in this paper."
    487     },
    488     {
    489       "title": "Iterative Reasoning Preference Optimization",
    490       "authors": ["Richard Yuanzhe Pang", "Weizhe Yuan", "Kyunghyun Cho", "He He", "Sainbayar Sukhbaatar", "Jason Weston"],
    491       "year": 2024,
    492       "arxiv_id": "2404.19733",
    493       "relevance": "Iterative preference optimization approach for continuous model alignment, relevant to iterative enhancement methods."
    494     },
    495     {
    496       "title": "LLM2LLM: Boosting LLMs with Novel Iterative Data Enhancement",
    497       "authors": ["Nicholas Lee", "Thanakul Wattanawong", "Sehoon Kim", "Karttikeya Mangalam", "Sheng Shen", "Gopala Anumanchipalli", "Michael W. Mahoney", "Kurt Keutzer", "Amir Gholami"],
    498       "year": 2024,
    499       "arxiv_id": "2403.15042",
    500       "relevance": "Iterative data enhancement method for LLMs using strong model feedback, relevant baseline for understanding external-signal-dependent iterative enhancement."
    501     },
    502     {
    503       "title": "Evaluating Large Language Models Trained on Code",
    504       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    505       "year": 2021,
    506       "arxiv_id": "2107.03374",
    507       "relevance": "Introduces HumanEval benchmark used as a key evaluation metric in this paper's experiments."
    508     },
    509     {
    510       "title": "LIMA: Less Is More for Alignment",
    511       "authors": ["Chunting Zhou", "Pengfei Liu", "Puxin Xu"],
    512       "year": 2024,
    513       "relevance": "Demonstrates that small amounts of high-quality data can effectively align LLMs, directly relevant to I-SHEEP's low-resource alignment premise."
    514     }
    515   ],
    516   "engagement_factors": {
    517     "practical_relevance": {
    518       "score": 2,
    519       "justification": "Practitioners could use I-SHEEP to self-improve base models without external data or APIs, making it relevant for resource-constrained fine-tuning scenarios."
    520     },
    521     "surprise_contrarian": {
    522       "score": 1,
    523       "justification": "Iterative self-improvement builds on existing self-instruct ideas; the iterative extension is incremental rather than paradigm-shifting."
    524     },
    525     "fear_safety": {
    526       "score": 0,
    527       "justification": "No safety or security concerns raised; the paper focuses on improving model quality through self-generated data."
    528     },
    529     "drama_conflict": {
    530       "score": 0,
    531       "justification": "No controversial claims or conflicts with established results; the paper presents a straightforward extension of prior work."
    532     },
    533     "demo_ability": {
    534       "score": 1,
    535       "justification": "Code is claimed to be available via an anonymous link, but reproducing the results requires significant compute (8×A800 GPUs), limiting hands-on experimentation."
    536     },
    537     "brand_recognition": {
    538       "score": 1,
    539       "justification": "Uses well-known models (Qwen, Llama-3) but authors are from academic institutions and smaller companies (01.ai, M-A-P), not top-tier AI labs."
    540     }
    541   }
    542 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs