ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25520B)


      1 {
      2   "paper": {
      3     "title": "Steering LLMs via Scalable Interactive Oversight",
      4     "authors": [
      5       "Enyu Zhou",
      6       "Zhiheng Xi",
      7       "Long Ma",
      8       "Zhihao Zhang",
      9       "Shihan Dou",
     10       "Zhikai Lei",
     11       "Guoteng Wang",
     12       "Rui Zheng",
     13       "Hang Yan",
     14       "Tao Gui",
     15       "Qi Zhang",
     16       "Xuanjing Huang"
     17     ],
     18     "year": 2026,
     19     "venue": "arXiv",
     20     "arxiv_id": "2602.04210"
     21   },
     22   "scan_version": 2,
     23   "active_modules": ["experimental_rigor"],
     24   "methodology_tags": ["benchmark-eval", "case-study"],
     25   "key_findings": "The Scalable Interactive Oversight framework decomposes complex user intent into a recursive tree of manageable decisions, enabling non-experts to produce expert-level PRDs with up to 54% alignment improvement over vanilla interaction baselines. The framework's interaction signals can serve as rewards for RL training, with both user-only and combined user+expert rewards improving alignment. RL training also generalizes to untrained modules and unseen model configurations, and improves interaction efficiency by reducing the number of turns needed.",
     26   "checklist": {
     27     "artifacts": {
     28       "code_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No repository URL or code release is mentioned anywhere in the paper."
     32       },
     33       "data_released": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The 37-case test dataset and 700 SFT training samples are described but no download link or release is provided."
     37       },
     38       "environment_specified": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No environment specifications, dependency lists, or Dockerfiles are provided. The paper mentions Docker for baselines but does not provide reproducible environment details."
     42       },
     43       "reproduction_instructions": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No reproduction instructions or scripts are provided. The experimental setup is described in prose but lacks step-by-step reproduction guidance."
     47       }
     48     },
     49     "statistical_methodology": {
     50       "confidence_intervals_or_error_bars": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "Results in Tables 1, 2, 3 are reported as point estimates without confidence intervals or error bars."
     54       },
     55       "significance_tests": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The paper claims improvements (e.g., 54%, 33%, 39%) over baselines but no statistical significance tests are reported."
     59       },
     60       "effect_sizes_reported": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Percentage improvements are reported with baseline context, e.g., '0.359 to 0.554, corresponding to a +54% relative gain' (Section 4.2), and Table 1 provides absolute scores enabling comparison."
     64       },
     65       "sample_size_justified": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The test set is 37 cases and the real-user study is 10 cases. No justification for these sample sizes or power analysis is provided."
     69       },
     70       "variance_reported": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "Figure 6 mentions 'average across three test runs' but no standard deviations or spread measures are reported in any table. Single-point results throughout."
     74       }
     75     },
     76     "evaluation_design": {
     77       "baselines_included": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Two baselines are compared: (1) direct PRD generation via vibe coding frameworks (Codex, Claude Code, Gemini CLI) and (2) vanilla multi-turn interaction (Section 4.1, Table 1)."
     81       },
     82       "baselines_contemporary": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Baselines include GPT-5, claude-sonnet-4.5, and Gemini-2.5-pro, as well as Codex, Claude Code, and Gemini CLI — all contemporary tools."
     86       },
     87       "ablation_study": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Section 4.3 provides ablation: low-burden feedback alone, tree updating with lightweight model (o4-mini) vs full model (GPT-5), showing incremental contributions. Section 5.3/Table 5 ablates reward components."
     91       },
     92       "multiple_metrics": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Per-module alignment scores (5 modules) are reported separately and as averages. Both LLM-judge and human-judge evaluations are used (Table 2)."
     96       },
     97       "human_evaluation": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Table 2 reports human-judge scores for deployed websites. Section 4.4 describes a real-user study with human interaction. Human annotators validate rubrics (Appendix A.2) and user simulation (Appendix A.3)."
    101       },
    102       "held_out_test_set": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "37 test cases are sampled for test-time validation (Section 4.1). RL training uses separate data, and Table 3 tests on both same-as-training and unseen (GPT-5) settings."
    106       },
    107       "per_category_breakdown": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Table 1 breaks down results by all 5 PRD modules. Table 3 similarly shows per-module results including untrained modules M3-M5."
    111       },
    112       "failure_cases_discussed": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Figure 4 (middle) shows the system handling ambiguous/uncertain user inputs, including DontCare and DontKnow responses. Section 4.2 notes Modules 3-5 'generally exhibit lower [scores] across methods.'"
    116       },
    117       "negative_results_reported": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "Every configuration shows improvement over baselines. No failed approaches, abandoned configurations, or scenarios where the method underperforms are reported."
    121       }
    122     },
    123     "claims_and_evidence": {
    124       "abstract_claims_supported": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The abstract claims 54% improvement in alignment, which matches the Gemini-2.5-pro result in Table 1 (0.359→0.554). The RL optimization claim is supported by Section 5 experiments."
    128       },
    129       "causal_claims_justified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The ablation study (Section 4.3) uses controlled single-variable manipulation to isolate component contributions. Causal language like 'improves' is supported by ablation design."
    133       },
    134       "generalization_bounded": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper tests only on website PRD generation but uses broad framing like 'Scalable Interactive Oversight' and 'complex, long-horizon tasks.' Section 7 mentions 'broader real-user evaluations' as future work but the title and framing are not bounded to web development."
    138       },
    139       "alternative_explanations_discussed": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No discussion of alternative explanations. The improvement could be due to more interaction turns rather than the tree structure specifically, or due to the user simulator being biased toward the framework's interaction style. These confounds are not addressed."
    143       },
    144       "proxy_outcome_distinction": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The paper explicitly frames PRD alignment as a proxy for end-to-end software alignment. Section 2.2 explains why PRD was chosen as an 'evaluation pivot' and Table 2 separately evaluates downstream website implementations, acknowledging the gap."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Models are referred to by marketing names only: 'GPT-5', 'claude-sonnet-4.5', 'gemini-2.5-pro', 'deepseek-R1', 'Qwen3-235B-A22B-Instruct', 'Qwen3-30B-A3B', 'o4-mini'. No API versions or snapshot dates are provided."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Full prompt texts for all major components are provided in Appendix C: interaction model (C.1), tree initialization (C.2), tree updating (C.3), document generator (C.4), user simulation (C.5), evaluation (C.6), progressive reward (C.7), and rubrics generation (C.8)."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Appendix A.4 reports RL hyperparameters: learning rate 2e-6, batch size 8, clipping range [0.8, 1.2], PPO epochs 1, rollout size 4, SFT learning rate 5e-5. However, LLM API settings (temperature) are not stated."
    165       },
    166       "scaffolding_described": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The tree-based interaction scaffold is described in detail in Section 3 and Algorithm 1: decomposition initialization, node-level interaction, tree updating, depth-first traversal. Full workflow is formalized."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 4.1 describes the dataset construction: crawling production websites for UI components, using search-augmented LLM for additional info, generating structured PRDs with LLMs, and synthesizing initial user requests. The pipeline is documented."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 7 (Conclusion & Future Work) and the Impact Statement discuss limitations including early misunderstanding amplification, non-applicability to safety-critical domains, and limited real-user evaluation."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The Impact Statement notes specific threats: 'accumulated preferences may amplify early misunderstandings or biases in user intent' and 'the framework is not intended for safety-critical domains.' Section 7 acknowledges limited real-user evaluation and scope to requirements only."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 7 explicitly states: 'our work focuses on requirement-level oversight and does not fully study code-level supervision.' The Impact Statement notes it is 'not intended for safety-critical domains.'"
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No raw data (PRDs, interaction traces, rubrics, evaluation results) is released for independent verification."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section 4.1 describes crawling production websites for UI components, using search-augmented LLM for additional info, then generating structured PRDs. Appendix A.3 describes user simulation validation on a 272-sample test set."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The real-user study (Section 4.4) says 'we hired a non-expert' with no description of recruitment method, selection criteria, or how the person was chosen. Human annotators in Appendix A.2-A.3 are similarly undescribed."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The pipeline from website crawling → PRD generation → query synthesis → interaction → evaluation is documented across Section 4.1 and Appendix A.1-A.2."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No funding information or acknowledgments section is present in the paper."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Author affiliations are clearly listed: Fudan University and Shanghai Qiji Zhifeng Co., Ltd."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No funding is disclosed, so independence cannot be assessed. Some authors are from Shanghai Qiji Zhifeng Co., Ltd., a commercial entity whose relationship to the research outcomes is not discussed."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests statement or financial disclosures are present in the paper."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "Multiple LLMs are used (GPT-5, Claude Sonnet 4.5, Gemini 2.5 Pro, DeepSeek R1) but none have their training cutoff dates stated. The benchmark is constructed from real websites that may exist in training data."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "The PRDs are generated from crawled production websites. These websites and their descriptions likely exist in LLM training data, but this overlap is not discussed."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "The evaluation uses LLM-generated PRDs as ground truth, and LLMs as judges. The circular dependency (LLMs generating targets that LLMs evaluate against) is not discussed as a contamination concern."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": true,
    258         "answer": false,
    259         "justification": "The real-user study (Section 4.4) is not pre-registered."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": true,
    263         "answer": false,
    264         "justification": "No IRB or ethics approval is mentioned for the real-user study or human annotation tasks."
    265       },
    266       "demographics_reported": {
    267         "applies": true,
    268         "answer": false,
    269         "justification": "The real user is described only as 'a non-expert.' No demographics, background, or experience level are reported. Human annotators are similarly uncharacterized."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inclusion or exclusion criteria are stated for the hired non-expert or human annotators."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "The real-user study is not a randomized experiment — it is a single-user case study with no control condition requiring randomization."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No blinding is applicable — the user directly interacts with the system in a single-arm study."
    285       },
    286       "attrition_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Section 4.4 says 10 cases were conducted due to 'resource limits' but does not report whether any were excluded or failed."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "The framework calls multiple LLMs (GPT-5, Gemini, etc.) across many interaction nodes, tree updates, and document generation steps. No API costs, token counts, or per-example costs are reported."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "RL training uses Qwen3-30B-A3B but no GPU hours, training time, or total compute budget is stated."
    302       }
    303     },
    304     "experimental_rigor": {
    305       "seed_sensitivity_reported": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "Figure 6 caption mentions 'average across three test runs' but no seed sensitivity analysis or variance across seeds is reported."
    309       },
    310       "number_of_runs_stated": {
    311         "applies": true,
    312         "answer": true,
    313         "justification": "Figure 6 states 'average across three test runs.' Appendix A.4 states rollout size of 4."
    314       },
    315       "hyperparameter_search_budget": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "Appendix A.4 reports final hyperparameters and mentions that rollout size 4 'achieved the best empirical performance,' implying search, but no search budget or configurations tried are reported."
    319       },
    320       "best_config_selection_justified": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The paper states rollout size 4 achieved 'best empirical performance' but does not explain the selection criterion or report results for other configurations."
    324       },
    325       "multiple_comparison_correction": {
    326         "applies": false,
    327         "answer": false,
    328         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    329       },
    330       "self_comparison_bias_addressed": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The authors evaluate their own framework against baselines they implemented. No acknowledgment of author-evaluation bias is made."
    334       },
    335       "compute_budget_vs_performance": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "The proposed method uses substantially more compute than baselines (multiple LLM calls per node, tree updates, multi-turn interaction vs single-turn generation) but this compute difference is not discussed or controlled for."
    339       },
    340       "benchmark_construct_validity": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "The paper validates LLM-judge agreement across 3 models (Table 4, 87-90% agreement) and against human annotators (0.5% unreasonable rubrics). Section 2.2 justifies PRD as evaluation pivot. User simulator is validated against humans (Appendix A.3)."
    344       },
    345       "scaffold_confound_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The framework adds significant scaffolding (tree decomposition, preference accumulation) on top of the same base models as baselines. The improvement could be attributed to more structured interaction rather than the specific tree mechanism, but this confound is not isolated."
    349       }
    350     }
    351   },
    352   "claims": [
    353     {
    354       "claim": "Scalable Interactive Oversight achieves 54% improvement in alignment over vanilla interaction on Gemini-2.5-pro",
    355       "evidence": "Table 1: average score increases from 0.359 (vanilla interaction) to 0.554 (Ours) on Gemini-2.5-pro",
    356       "supported": "moderate"
    357     },
    358     {
    359       "claim": "The framework improves alignment by 33% over vanilla interaction and 39% over Codex on GPT-5",
    360       "evidence": "Table 1: GPT-5 scores of 0.503 (vanilla), 0.481 (Codex), 0.670 (Ours)",
    361       "supported": "moderate"
    362     },
    363     {
    364       "claim": "Alignment scales with interaction — more interaction nodes yield higher alignment scores",
    365       "evidence": "Figure 2 shows increasing alignment scores over interaction nodes for both simulated and human users",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "Online user feedback alone (DontCare penalty) is sufficient to improve system alignment through RL",
    370       "evidence": "Figure 6(a) and Table 3: User Reward improves over SFT baseline (0.532→0.536 same setting, 0.616→0.629 GPT-5 setting)",
    371       "supported": "weak"
    372     },
    373     {
    374       "claim": "RL training generalizes to untrained modules (M3-M5) and unseen model configurations (GPT-5)",
    375       "evidence": "Table 3: M3-M5 average improves from 0.500 to 0.518 (same setting) and 0.606 to 0.640 (GPT-5 setting) under User+Expert Reward",
    376       "supported": "weak"
    377     },
    378     {
    379       "claim": "RL training improves interaction efficiency by reducing the number of interaction turns",
    380       "evidence": "Figure 7 shows decreasing total turns and per-node turns over training steps",
    381       "supported": "moderate"
    382     }
    383   ],
    384   "red_flags": [
    385     {
    386       "flag": "Simulated users as primary evaluation",
    387       "detail": "The main experiments (Table 1, Table 3) use a simulated user (DeepSeek R1) rather than real humans. The simulator's agreement with humans is only 0.677 (Appendix A.3), meaning ~1/3 of responses diverge from human behavior. The simulator may systematically favor the structured interaction format."
    388     },
    389     {
    390       "flag": "Circular LLM evaluation",
    391       "detail": "LLMs generate the ground-truth PRDs, LLMs simulate users, and LLMs judge alignment. This creates a circular dependency where the evaluation may reward LLM-preferred outputs rather than genuinely aligned ones."
    392     },
    393     {
    394       "flag": "Tiny real-user study",
    395       "detail": "Only 1 non-expert on 10 cases (Section 4.4), far too small to draw conclusions about general user behavior. No demographics, selection criteria, or statistical analysis reported."
    396     },
    397     {
    398       "flag": "No statistical significance testing",
    399       "detail": "All comparative claims are based on point estimate differences without any significance tests. With only 37 test cases, many observed differences may not be statistically significant."
    400     },
    401     {
    402       "flag": "Compute fairness not controlled",
    403       "detail": "The proposed method makes many more LLM calls than baselines (multi-turn interaction + tree updates + document generation vs single-turn generation). The improvement could partly be due to spending more compute rather than the framework design."
    404     },
    405     {
    406       "flag": "No cost reporting for a multi-LLM pipeline",
    407       "detail": "The framework calls GPT-5, Gemini, o4-mini, and other models across multiple interaction nodes. No cost analysis is provided, making practical feasibility impossible to assess."
    408     }
    409   ],
    410   "cited_papers": [
    411     {
    412       "title": "Measuring the impact of early-2025 ai on experienced open-source developer productivity",
    413       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    414       "year": 2025,
    415       "arxiv_id": "2507.09089",
    416       "relevance": "Empirical study showing that AI assistance increased experienced developers' task-completion time by 19%, directly relevant to AI productivity evaluation."
    417     },
    418     {
    419       "title": "Weak-to-strong generalization: Eliciting strong capabilities with weak supervision",
    420       "authors": ["Collin Burns", "Pavel Izmailov", "Jan Hendrik Kirchner"],
    421       "year": 2023,
    422       "arxiv_id": "2312.09390",
    423       "relevance": "Foundational work on weak-to-strong supervision paradigm, core theoretical motivation for this paper's approach."
    424     },
    425     {
    426       "title": "Measuring progress on scalable oversight for large language models",
    427       "authors": ["Samuel R. Bowman"],
    428       "year": 2022,
    429       "arxiv_id": "2211.03540",
    430       "relevance": "Defines the sandwich protocol used in this paper's evaluation framework for scalable oversight."
    431     },
    432     {
    433       "title": "A survey of vibe coding with large language models",
    434       "authors": ["Yuyao Ge"],
    435       "year": 2025,
    436       "arxiv_id": "2510.12399",
    437       "relevance": "Survey of the vibe coding paradigm whose oversight challenges this paper addresses."
    438     },
    439     {
    440       "title": "Supervising strong learners by amplifying weak humans",
    441       "authors": ["Paul Christiano", "Jan Leike", "Tom B. Brown"],
    442       "year": 2018,
    443       "arxiv_id": "1810.08575",
    444       "relevance": "Core scalable oversight work on recursive amplification that inspires this paper's tree-based approach."
    445     },
    446     {
    447       "title": "Recursively summarizing books with human feedback",
    448       "authors": ["Jeff Wu", "Long Ouyang", "Daniel M. Ziegler"],
    449       "year": 2021,
    450       "arxiv_id": "2109.10862",
    451       "relevance": "Demonstrates recursive decomposition for scalable oversight in long-form text, methodological predecessor."
    452     },
    453     {
    454       "title": "Constitutional ai: Harmlessness from ai feedback",
    455       "authors": ["Yuntao Bai"],
    456       "year": 2022,
    457       "arxiv_id": "2212.08073",
    458       "relevance": "AI critique approach to alignment, one of the scalable oversight paradigms discussed."
    459     },
    460     {
    461       "title": "How developers interact with ai: A taxonomy of human-ai collaboration in software engineering",
    462       "authors": ["Christoph Treude", "Marco A Gerosa"],
    463       "year": 2025,
    464       "relevance": "Taxonomy of human-AI collaboration patterns in SE, relevant to understanding developer-AI interaction."
    465     },
    466     {
    467       "title": "Training language models to follow instructions with human feedback",
    468       "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"],
    469       "year": 2022,
    470       "relevance": "RLHF methodology that this paper extends with online interactive feedback signals."
    471     },
    472     {
    473       "title": "AI safety via debate",
    474       "authors": ["Geoffrey Irving", "Paul Christiano", "Dario Amodei"],
    475       "year": 2018,
    476       "arxiv_id": "1805.00899",
    477       "relevance": "Debate-based scalable oversight approach discussed as related work."
    478     }
    479   ]
    480 }

Impressum · Datenschutz