scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28256B)
      1 {
      2   "scan_version": 2,
      3   "active_modules": ["experimental_rigor", "data_leakage"],
      4   "paper": {
      5     "title": "Towards Scalable Oversight via Partitioned Human Supervision",
      6     "authors": ["Ren Yin", "Takashi Ishida", "Masashi Sugiyama"],
      7     "year": 2025,
      8     "venue": "ICLR 2026",
      9     "arxiv_id": "2510.22500"
     10   },
     11   "methodology_tags": ["theoretical", "benchmark-eval"],
     12   "key_findings": "The paper proposes using complementary labels (indicating incorrect options) from domain specialists as a scalable oversight mechanism for evaluating and training AI systems on superhuman tasks. They derive an unbiased estimator of top-1 accuracy from complementary labels alone, show how to combine them with scarce ordinary labels via IVW and ML mixture estimators, and provide finite-sample deviation guarantees. Empirically, they demonstrate these estimators work on LLM benchmarks (MMLU-Pro, MedQA, GPQA, MATH) and that complementary-label fitness signals can drive agentic system design (ADAS, AFlow) to outperform manually designed baselines.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The abstract states 'Our code is available at [link]' and the Reproducibility Statement confirms a public GitHub repository with scripts to reproduce experiments."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper uses publicly available benchmarks (MMLU-Pro, MedQA-USMLE, GPQA, MATH, EDINET-Bench, Medical Abstracts). EDINET-Bench Extended is constructed from released code (Sugiura et al., 2026)."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No mention of requirements.txt, Dockerfile, or detailed environment specifications in the paper. The reproducibility statement refers to the repository but does not list dependencies."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The Reproducibility Statement says 'Code and scripts to reproduce our experiments are available in our public GitHub repository' with full implementation details and evaluation scripts."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Tables 1 and 2 report results as 'mean ± standard deviation across three random seeds' with the average of per-run standard deviations also shown in parentheses."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper provides finite-sample deviation bounds (Theorems 2, 4, Proposition 3) but does not apply statistical significance tests to compare estimators against each other or baselines. Comparisons in Tables 1-2 are based on point estimates and standard deviations only."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Table 3 (Appendix L) reports absolute deviations from the Ord-Eval oracle in percentage points for each estimator. Tables 1-2 provide accuracy values with context for all conditions. The variance ratio formula (Eq. 4) quantifies the efficiency gap."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Eq. (5) derives how many complementary labels are needed to match the variance of ordinary labels. Sample sizes are set at no=300 (120 for GPQA due to dataset size) with explicit justification. The variance-matching analysis directly addresses sample size requirements."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Tables 1 and 2 report both across-seed variability (±) and average within-run standard deviations (in parentheses) across three independent runs."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Multiple baselines are included: ordinary-label estimator, complementary-only at same sample size, variance-matched complementary, fixed 0.5/0.5 weighting, plus Ord-Eval oracle reference. For agentic training (§3.3), seven manually designed baselines (COT, COT-SC, Self-Refine, LLM Debate, etc.)."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The automated agent-design frameworks used, ADAS (Hu et al., 2025) and AFlow (Zhang et al., 2025b), are both recent. Manually designed baselines are all from 2022-2025. Estimator baselines are the natural comparisons for this setting."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Section 3.1 includes an ablation on IVW weights (Figure 2), varying nc from no to 20no on MedQA-USMLE, showing how optimal weight changes with complementary label quantity. Appendix K ablates raw vs. transformed complementary accuracy."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper reports mean accuracy, standard deviation (across seeds), average per-run standard deviation, and absolute deviation from Ord-Eval oracle (Table 3). Estimator variance is analyzed separately."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "The paper evaluates statistical estimators and agentic system performance on benchmarks. Human evaluation of system outputs is not relevant to the claims about estimator properties."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "For agentic training (§3.3, Appendix J): 'we constructed validation and test splits for each benchmark' with explicit sizes (e.g., 120 validation and 800 test for Math-MC). ADAS/AFlow use validation for search and report on test."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are reported separately for each benchmark (MMLU-Pro, MedQA-USMLE, GPQA, MATH, MATH-CoT, EDINET, EDINET Extended, Medical Abstracts) in Tables 1 and 2. Figure 3 breaks down agentic results per benchmark."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper discusses when the transform hurts performance (GPQA, the hardest benchmark), where variance amplification is problematic (Appendix K/Figure 4), and limitations of EDINET-Bench with K=16 industries and low accuracy amplifying variance."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Appendix K shows that on GPQA, the transformed complementary accuracy hurts ADAS (drops from 47.2 to 42.4). The paper discusses when variance amplification makes the transform detrimental and notes the Comp-no estimator is 'unreliable for reproducible evaluation.'"
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims: unbiased estimator (Corollary 1, proved), variance characterization (Eq. 3-5), mixed estimators with finite-sample guarantees (Theorems 2, 4), evaluation without ground truth (Tables 1-2), and agentic training with weak signals (Figure 3). All supported by results."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The paper makes causal claims about complementary labels improving agentic system design. The experimental setup uses controlled comparisons (same benchmarks, validation/test splits) and ablations. The estimator properties are derived mathematically. The language 'improves downstream performance' is supported by controlled ADAS/AFlow experiments."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The Discussion section explicitly discusses assumptions (uniform wrong-index, multiple-choice format) and when they may not hold. Limitations section acknowledges the framework may not apply to open-ended problems or when complementary labels are scarce. The paper scopes claims to multiple-choice evaluation."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 5 discusses biased answers from experts, uncertain answers (noise), overlaps between expert fields, and structural assumptions about multiple-choice format. It acknowledges these could affect the framework and cites relevant extensions (biased complementary labels, noisy complementary labels, multiple complementary labels)."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper clearly measures top-1 accuracy and claims to estimate top-1 accuracy. It distinguishes between 'weakly-correct' (avoiding complementary label, footnote 1) and actual correctness. The estimator's relationship to true accuracy is formalized mathematically. No proxy gap exists between measurement and claim."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper specifies 'gpt-5-nano' (OpenAI, 2025b) and 'GPT-4.1-nano' for the MATH benchmark, with explicit references to the API documentation. These are specific model identifiers."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Appendix J provides the full prompt text used for AFlow (the multiple-choice answer extraction prompt). The paper states ADAS prompts were adapted and 'uploaded the full text of the adapted prompts and scripts in the supplementary material.'"
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No mention of temperature, top-p, or other API sampling parameters for the LLM evaluations. The paper uses GPT-5-nano and GPT-4.1-nano via API but does not state sampling settings."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The agentic scaffolding follows ADAS (Hu et al., 2025) and AFlow (Zhang et al., 2025b) official implementations. Appendix J describes modifications. The evolved agent workflows are visualized in Figure 5 (Appendix M). Seven baseline agentic architectures are described."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Appendix C describes the full labeling protocol (Algorithm 1) including option shuffling, complementary label generation, and the annotation interface design. Appendix H documents benchmark preparation: MMLU-Pro filtered to 10-option questions (9,795 examples), MATH-MC construction from MATH dataset (11,751 examples after filtering), duplicate removal."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 5 is titled 'Discussion & Limitations' with two substantial subsections: 'Assumptions and extensions' and 'Limitations.' These discuss structural assumptions, extensions needed, and cases where the framework breaks down."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 5 discusses specific threats: biased expert answers, uncertain annotations, overlap between expert fields, the multiple-choice format assumption, and scenarios where complementary labels are scarce or inapplicable. These are specific to this study's design."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "The Limitations subsection explicitly states: the framework requires complementary labels (may be inapplicable for open-ended problems), assumes human feedback is available, and cannot handle cases where 'the boundary of the ground truth becomes ambiguous.' The multiple-choice format requirement is clearly scoped."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The benchmarks used are publicly available (MMLU-Pro, MedQA-USMLE, GPQA, MATH, EDINET-Bench, Medical Abstracts). The code repository contains evaluation scripts. The complementary labels are generated synthetically from oracle labels following the documented protocol."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Appendix C provides Algorithm 1 detailing the full data collection protocol. Appendix H describes each benchmark's construction. The complementary label generation procedure is formalized mathematically (Eq. 1) and procedurally (Algorithm 1)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants — all experiments use standard public benchmarks with synthetically generated complementary labels. The 'partitioned human supervision' is a proposed protocol, not one executed with actual human annotators in these experiments."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Algorithm 1 documents the full pipeline from MCQ items through option shuffling, annotator routing, label recording, to estimator computation. Appendix H documents benchmark filtering (e.g., MMLU-Pro: 10-option only → 9,795; MATH-MC: discard <4 distractors → 11,751)."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Acknowledgements section: 'RY was supported by the RIKEN-AIP Undergraduate Research Program, TI was supported by JST ASPIRE Grant Number JPMJAP25B1, and MS was supported by JST ASPIRE Grant Number JPMJAP2405.'"
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are listed: The University of Tokyo and RIKEN. These are academic institutions, not companies whose products are being evaluated."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Funders are RIKEN-AIP and JST ASPIRE (Japanese government research funding agencies). These have no financial stake in the outcome of complementary-label estimator research."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement is included in the paper. While the authors are at academic institutions, the absence of an explicit declaration means this criterion is not satisfied."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper uses GPT-5-nano and GPT-4.1-nano on public benchmarks (MMLU-Pro, MedQA, GPQA, MATH) but does not state training data cutoff dates for these models."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether GPT-5-nano or GPT-4.1-nano may have been trained on the benchmark datasets. MMLU-Pro, MedQA, GPQA, and MATH are all publicly available and could be in training data."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The paper uses established benchmarks (some from 2021-2024) with models released in 2025, but does not discuss contamination risk. The Reproducibility Statement notes 'exact replication may vary over time if models are updated' but does not address data contamination."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in the experiments. The complementary labels are synthetically generated from oracle labels."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants. The paper proposes a human annotation protocol but does not execute it with actual humans."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in the experiments."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "The paper calls GPT-5-nano and GPT-4.1-nano APIs across multiple benchmarks, seeds, and estimator configurations but does not report API costs, tokens consumed, or wall-clock time."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No mention of total API spend, compute hours, or hardware used for the experiments."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Tables 1 and 2 report results 'across three random seeds' with both across-seed variability and within-run standard deviations."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Explicitly stated: 'three independent runs averaged' (Section 3.1). Appendix J specifies 'number of validation runs to 3' for AFlow."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "For ADAS and AFlow agent search, no mention of how many configurations were explored, search budget, or total compute for the agent design search."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "For agentic experiments: 'computing both complementary and transformed complementary accuracy on the validation set and reporting the better-performing variant' (§3.3). Selection is on validation, not test."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Multiple estimators are compared across multiple benchmarks without any correction for multiple comparisons."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors propose the complementary-label estimators and evaluate them. No discussion of self-comparison bias. However, the agentic search frameworks (ADAS, AFlow) use official implementations, which partially mitigates this for the agentic experiments."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "No analysis of compute cost vs. performance. The variance-matching analysis (Eq. 5) addresses sample efficiency but not computational cost."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "The paper discusses construct validity: GPQA is noted as having substantial uncertainty due to small dataset size, GPT-5-nano was avoided for MATH due to ceiling effects, and the paper explicitly discusses when multiple-choice format may not capture the target construct (Section 5)."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "For agentic experiments, the paper compares ADAS and AFlow against baselines using the same underlying model. The scaffolding IS the variable being tested (can complementary labels drive agent search?), and the comparison is fair within each framework."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of whether GPT-5-nano or GPT-4.1-nano training data includes benchmark solutions. MMLU-Pro, MedQA, GPQA, and MATH all predate these models."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the evaluation setup leaks information. The complementary label generation is from oracle labels, which is acknowledged, but standard benchmark leakage through training data is not discussed."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of train/test independence for the underlying LLM benchmarks."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No concrete leakage detection or prevention methods applied despite using public benchmarks with potentially contaminated models."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "The complementary-label estimator (Eq. 2) is an unbiased estimator of top-1 accuracy under the uniform wrong-index assumption.",
    364       "evidence": "Corollary 1 with proof in Appendix A, connecting to the risk-rewrite identity of Ishida et al. (2019) in Appendix B.",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "IVW and ML mixture estimators consistently achieve lower variance than single-source estimators and approximate the Ord-Eval reference most reliably.",
    369       "evidence": "Tables 1 and 2 show IVW and ML have the lowest within-run standard deviations across benchmarks. Table 3 shows lowest average deviation from Ord-Eval (IVW: 1.42, ML: 1.48 vs Ord: 1.92).",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Agentic systems guided by complementary-label signals (ADAS and AFlow) consistently outperform manually designed baselines.",
    374       "evidence": "Figure 3 shows ADAS and AFlow outperforming COT, COT-SC, Self-Refine, LLM Debate, Step-back, QD, and Role Assignment on GPQA, Math-MC, and Medical Abstracts.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "The complementary-label estimator requires nc = (1 + (K-2)/A) × no complementary labels to match the variance of no ordinary labels.",
    379       "evidence": "Derived in Eq. (5) from the variance analysis in Eq. (3)-(4). Empirically validated in the Comp-Var rows of Table 1.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Finite-sample deviation bounds hold for both complementary-only and mixed estimators.",
    384       "evidence": "Theorem 2 (Hoeffding + empirical Bernstein for complementary), Proposition 3 (union bound for mixture), Theorem 4 (Bernstein for mixture). Proofs in Appendices E and F.",
    385       "supported": "strong"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "Synthetic complementary labels only",
    391       "detail": "All experiments generate complementary labels synthetically from oracle ground-truth labels. The paper proposes a human annotation protocol but never tests it with actual human annotators. The practical challenges of real partitioned human supervision (expert disagreement, overlapping expertise, biased rejections) are discussed but not empirically validated."
    392     },
    393     {
    394       "flag": "No contamination analysis",
    395       "detail": "The paper uses GPT-5-nano and GPT-4.1-nano on established public benchmarks (MMLU-Pro, MedQA, GPQA, MATH) without discussing whether these models were trained on the benchmark data. This is particularly relevant since the paper's central claim is about evaluation methodology — contaminated benchmarks would undermine the validation."
    396     },
    397     {
    398       "flag": "Limited agentic evaluation",
    399       "detail": "The agentic training experiments (§3.3) use only 3 benchmarks and report only test accuracy. No analysis of the evolved agents' generalization, robustness, or computational cost relative to baselines."
    400     }
    401   ],
    402   "cited_papers": [
    403     {
    404       "title": "Weak-to-Strong Generalization: Eliciting strong capabilities with weak supervision",
    405       "authors": ["Collin Burns", "Pavel Izmailov", "Jan Hendrik Kirchner"],
    406       "year": 2024,
    407       "relevance": "Core scalable oversight work studying whether strong models trained on weak supervisors can exceed those supervisors."
    408     },
    409     {
    410       "title": "Measuring progress on scalable oversight for large language models",
    411       "authors": ["Samuel R. Bowman"],
    412       "year": 2022,
    413       "arxiv_id": "2211.03540",
    414       "relevance": "Defines the scalable oversight problem and superalignment regime that this paper addresses."
    415     },
    416     {
    417       "title": "Automated design of agentic systems",
    418       "authors": ["Shengran Hu", "Cong Lu", "Jeff Clune"],
    419       "year": 2025,
    420       "relevance": "ADAS framework used in agentic training experiments; automated agent workflow design requiring fitness signals."
    421     },
    422     {
    423       "title": "AFlow: Automating agentic workflow generation",
    424       "authors": ["Jiayi Zhang"],
    425       "year": 2025,
    426       "relevance": "AFlow framework used in agentic training experiments for automated workflow optimization."
    427     },
    428     {
    429       "title": "Debating with More Persuasive LLMs Leads to More Truthful Answers",
    430       "authors": ["Akbir Khan"],
    431       "year": 2024,
    432       "relevance": "Debate-based scalable oversight protocol for eliciting truthful answers from AI systems."
    433     },
    434     {
    435       "title": "Constitutional AI: Harmlessness from AI feedback",
    436       "authors": ["Yuntao Bai"],
    437       "year": 2022,
    438       "arxiv_id": "2212.08073",
    439       "relevance": "RLAIF approach replacing human preferences with AI-generated feedback for alignment."
    440     },
    441     {
    442       "title": "Alignment faking in large language models",
    443       "authors": ["Ryan Greenblatt"],
    444       "year": 2024,
    445       "arxiv_id": "2412.14093",
    446       "relevance": "Demonstrates alignment techniques can be vulnerable to strategic behavior producing false sense of safety."
    447     },
    448     {
    449       "title": "Sleeper Agents: Training deceptive llms that persist through safety training",
    450       "authors": ["Evan Hubinger"],
    451       "year": 2024,
    452       "arxiv_id": "2401.05566",
    453       "relevance": "Demonstrates deceptive AI behavior persisting through safety training, motivating better oversight."
    454     },
    455     {
    456       "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversations",
    457       "authors": ["Qingyun Wu"],
    458       "year": 2024,
    459       "relevance": "Multi-agent LLM framework relevant to agentic AI system design and collaboration."
    460     },
    461     {
    462       "title": "Self-Refine: Iterative refinement with self-feedback",
    463       "authors": ["Aman Madaan"],
    464       "year": 2023,
    465       "relevance": "Agentic self-improvement baseline used in the experiments."
    466     },
    467     {
    468       "title": "DAPO: An open-source LLM reinforcement learning system at scale",
    469       "authors": ["Qiying Yu"],
    470       "year": 2025,
    471       "relevance": "Open-source RL system for LLM training, relevant to RLVR and alignment pipelines."
    472     },
    473     {
    474       "title": "Darwin Gödel Machine: Open-ended evolution of self-improving agents",
    475       "authors": ["Jenny Zhang"],
    476       "year": 2026,
    477       "relevance": "Self-improving agent architecture relevant to automated agentic system design."
    478     }
    479   ]
    480 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs